// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"container/list"
	"encoding/binary"
	"fmt"
	"io"
	"math"
	"math/rand"
	"runtime"
	"strings"
	"sync/atomic"
	"time"

	"github.com/SagerNet/gvisor/pkg/sleep"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/tcpip"
	"github.com/SagerNet/gvisor/pkg/tcpip/hash/jenkins"
	"github.com/SagerNet/gvisor/pkg/tcpip/header"
	"github.com/SagerNet/gvisor/pkg/tcpip/ports"
	"github.com/SagerNet/gvisor/pkg/tcpip/seqnum"
	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
	"github.com/SagerNet/gvisor/pkg/waiter"
)

// EndpointState represents the state of a TCP endpoint.
type EndpointState tcpip.EndpointState

// Endpoint states. Note that these are represented in a netstack-specific
// manner and may not be meaningful externally. Specifically, they need to be
// translated to Linux's representation for these states if presented to
// userspace.
const (
	// The zero value is intentionally skipped so that the first real state
	// lines up with the numbering in the Linux header referenced below
	// (where TCP_ESTABLISHED == 1).
	_ EndpointState = iota
	// TCP protocol states in sync with the definitions in
	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13
	StateEstablished
	StateSynSent
	StateSynRecv
	StateFinWait1
	StateFinWait2
	StateTimeWait
	StateClose
	StateCloseWait
	StateLastAck
	StateListen
	StateClosing

	// Endpoint states internal to netstack.
	StateInitial
	StateBound
	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
	StateError
)

const (
	// rcvAdvWndScale is used to split the available socket buffer into
	// application buffer and the window to be advertised to the peer. This is
	// currently hard coded to split the available space equally.
	rcvAdvWndScale = 1

	// SegOverheadFactor is used to multiply the value provided by the
	// user on a SetSockOpt for setting the socket send/receive buffer sizes.
	SegOverheadFactor = 2
)

// connected returns true when s is one of the states representing an
// endpoint connected to a peer.
func (s EndpointState) connected() bool {
	switch s {
	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
		return true
	default:
		return false
	}
}

// connecting returns true when s is one of the states representing a
// connection in progress, but not yet fully established.
func (s EndpointState) connecting() bool {
	switch s {
	case StateConnecting, StateSynSent, StateSynRecv:
		return true
	default:
		return false
	}
}

// internal returns true when the state is netstack internal.
func (s EndpointState) internal() bool {
	switch s {
	case StateInitial, StateBound, StateConnecting, StateError:
		return true
	default:
		return false
	}
}

// handshake returns true when s is one of the states representing an endpoint
// in the middle of a TCP handshake.
func (s EndpointState) handshake() bool {
	switch s {
	case StateSynSent, StateSynRecv:
		return true
	default:
		return false
	}
}

// closed returns true when s is one of the states an endpoint transitions to
// when closed or when it encounters an error. This is distinct from a newly
// initialized endpoint that was never connected.
func (s EndpointState) closed() bool {
	switch s {
	case StateClose, StateError:
		return true
	default:
		return false
	}
}

// String implements fmt.Stringer.String.
func (s EndpointState) String() string {
	switch s {
	case StateInitial:
		return "INITIAL"
	case StateBound:
		return "BOUND"
	case StateConnecting:
		return "CONNECTING"
	case StateError:
		return "ERROR"
	case StateEstablished:
		return "ESTABLISHED"
	case StateSynSent:
		return "SYN-SENT"
	case StateSynRecv:
		return "SYN-RCVD"
	case StateFinWait1:
		return "FIN-WAIT1"
	case StateFinWait2:
		return "FIN-WAIT2"
	case StateTimeWait:
		return "TIME-WAIT"
	case StateClose:
		return "CLOSED"
	case StateCloseWait:
		return "CLOSE-WAIT"
	case StateLastAck:
		return "LAST-ACK"
	case StateListen:
		return "LISTEN"
	case StateClosing:
		return "CLOSING"
	default:
		panic("unreachable")
	}
}

// Reasons for notifying the protocol goroutine.
const (
	notifyNonZeroReceiveWindow = 1 << iota
	notifyClose
	notifyMTUChanged
	notifyDrain
	notifyReset
	notifyResetByPeer
	// notifyAbort is a request for an expedited teardown.
	notifyAbort
	notifyKeepaliveChanged
	notifyMSSChanged
	// notifyTickleWorker is used to tickle the protocol main loop during a
	// restore after we update the endpoint state to the correct one. This
	// ensures the loop terminates if the final state of the endpoint is
	// say TIME_WAIT.
	notifyTickleWorker
	notifyError
)

// SACKInfo holds TCP SACK related information for a given endpoint.
//
// +stateify savable
type SACKInfo struct {
	// Blocks is the array of SACK blocks tracked for this endpoint; at most
	// MaxSACKBlocks are stored, and only the first NumBlocks entries are
	// valid.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// blocks array above.
	NumBlocks int
}

// ReceiveErrors collects segment receive errors within the transport layer.
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvQueue is full.
	ZeroRcvWindowState tcpip.StatCounter

	// WantZeroRcvWindow is the number of times we wanted to advertise a
	// zero receive window but couldn't because it would have caused
	// the receive window's right edge to shrink.
	WantZeroRcvWindow tcpip.StatCounter
}

// SendErrors collects segment send errors within the transport layer.
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments failed to be sent
	// to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs failed to be sent
	// to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}

// Stats holds statistics about the endpoint.
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}

// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface.
func (*Stats) IsEndpointStats() {}

// sndQueueInfo implements a send queue.
//
// +stateify savable
type sndQueueInfo struct {
	sndQueueMu sync.Mutex `state:"nosave"`
	stack.TCPSndBufState

	// sndWaker is used to signal the protocol goroutine when there may be
	// segments that need to be sent.
	sndWaker sleep.Waker `state:"manual"`
}

// rcvQueueInfo contains the endpoint's rcvQueue and associated metadata.
//
// +stateify savable
type rcvQueueInfo struct {
	rcvQueueMu sync.Mutex `state:"nosave"`
	stack.TCPRcvBufState

	// rcvQueue is the queue for ready-for-delivery segments. This struct's
	// mutex must be held in order to append segments to the list.
	rcvQueue segmentList `state:"wait"`
}

// accepted tracks connections queued on a listening endpoint, waiting to be
// returned by Accept.
//
// +stateify savable
type accepted struct {
	// NB: this could be an endpointList, but ilist only permits endpoints to
	// belong to one list at a time, and endpoints are already stored in the
	// dispatcher's list.
	endpoints list.List `state:".([]*endpoint)"`
	cap       int
}

// endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// Each endpoint has a few mutexes:
//
// e.mu -> Primary mutex for an endpoint must be held for all operations except
// in e.Readiness where acquiring it will result in a deadlock in epoll
// implementation.
//
// The following mutexes can be acquired independent of e.mu but if
// acquired with e.mu then e.mu must be acquired first.
//
// e.acceptMu -> protects accepted.
// e.rcvQueueMu -> Protects e.rcvQueue and associated fields.
// e.sndQueueMu -> Protects the e.sndQueue and associated fields.
// e.lastErrorMu -> Protects the lastError field.
//
// LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different
// based on the context in which the lock is acquired. In the syscall context
// e.LockUser/e.UnlockUser should be used and when doing background processing
// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
// in brief.
//
// The reason for this locking behaviour is to avoid wakeups to handle packets.
// In cases where the endpoint is already locked the background processor can
// queue the packet up and go its merry way and the lock owner will eventually
// process the backlog when releasing the lock. Similarly when acquiring the
// lock from say a syscall goroutine we can implement a bit of spinning if we
// know that the lock is not held by another syscall goroutine. Background
// processors should never hold the lock for long and we can avoid an expensive
// sleep/wakeup by spinning for a short while.
//
// For more details please see the detailed documentation on
// e.LockUser/e.UnlockUser methods.
//
// +stateify savable
type endpoint struct {
	stack.TCPEndpointStateInner
	stack.TransportEndpointInfo
	tcpip.DefaultSocketOptionsHandler

	// endpointEntry is used to queue endpoints for processing to a given
	// TCP processor goroutine.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	endpointEntry `state:"nosave"`

	// pendingProcessing is true if this endpoint is queued for processing
	// to a TCP processor.
	//
	// Precondition: epQueue.mu must be held to read/write this field.
	pendingProcessing bool `state:"nosave"`

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack  `state:"manual"`
	waiterQueue *waiter.Queue `state:"wait"`
	uniqueID    uint64

	// hardError is meaningful only when state is stateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. hardError is protected by endpoint mu.
	hardError tcpip.Error

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex `state:"nosave"`
	lastError   tcpip.Error

	// rcvReadMu synchronizes calls to Read.
	//
	// mu and rcvQueueMu are temporarily released during data copying. rcvReadMu
	// must be held during each read to ensure atomicity, so that multiple reads
	// do not interleave.
	//
	// rcvReadMu should be held before holding mu.
	rcvReadMu sync.Mutex `state:"nosave"`

	// rcvQueueInfo holds the implementation of the endpoint's receive buffer.
	// The data within rcvQueueInfo should only be accessed while rcvReadMu, mu,
	// and rcvQueueMu are held, in that stated order. While processing the segment
	// range, you can determine a range and then temporarily release mu and
	// rcvQueueMu, which allows new segments to be appended to the queue while
	// processing.
	rcvQueueInfo rcvQueueInfo

	// rcvMemUsed tracks the total amount of memory in use by received segments
	// held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to
	// compute the window and the actual available buffer space. This is distinct
	// from rcvBufUsed above which is the actual number of payload bytes held in
	// the buffer not including any segment overheads.
	//
	// rcvMemUsed must be accessed atomically.
	rcvMemUsed int32

	// mu protects all endpoint fields unless documented otherwise. mu must
	// be acquired before interacting with the endpoint fields.
	//
	// During handshake, mu is locked by the protocol listen goroutine and
	// released by the handshake completion goroutine.
	mu          sync.CrossGoroutineMutex `state:"nosave"`
	ownedByUser uint32

	// state must be read/set using the EndpointState()/setEndpointState()
	// methods.
	state uint32 `state:".(EndpointState)"`

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to its correct
	// state.
	origEndpointState uint32 `state:"nosave"`

	isPortReserved    bool `state:"manual"`
	isRegistered      bool `state:"manual"`
	boundNICID        tcpip.NICID
	route             *stack.Route `state:"manual"`
	ttl               uint8
	isConnectNotified bool

	// h stores a reference to the current handshake state if the endpoint is in
	// the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep.
	// nil otherwise.
	h *handshake `state:"nosave"`

	// portFlags stores the current values of port related flags.
	portFlags ports.Flags

	// Values used to reserve a port or register a transport endpoint
	// (which ever happens first).
	boundBindToDevice tcpip.NICID
	boundPortFlags    ports.Flags
	boundDest         tcpip.FullAddress

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// workerRunning specifies if a worker goroutine is running.
	workerRunning bool

	// workerCleanup specifies if the worker goroutine must perform cleanup
	// before exiting. This can only be set to true when workerRunning is
	// also true, and they're both protected by the mutex.
	workerCleanup bool

	// recentTSTime is the unix time when we last updated
	// TCPEndpointStateInner.RecentTS.
	recentTSTime tcpip.MonotonicTime

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// tcpRecovery is the loss detection algorithm used by TCP.
	tcpRecovery tcpip.TCPRecovery

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue `state:"wait"`

	// synRcvdCount is the number of connections for this endpoint that are
	// in SYN-RCVD state; this is only accessed atomically.
	synRcvdCount int32

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// maxSynRetries is the maximum number of SYN retransmits that TCP should
	// send before aborting the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This is currently a no-op and does not change the SYN
	// retransmissions.
	maxSynRetries uint8

	// windowClamp is used to bound the size of the advertised window to
	// this value.
	windowClamp uint32

	// sndQueueInfo contains the implementation of the endpoint's send queue.
	sndQueueInfo sndQueueInfo

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// newSegmentWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and handle new segments queued to it.
	newSegmentWaker sleep.Waker `state:"manual"`

	// notificationWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and check for notifications.
	notificationWaker sleep.Waker `state:"manual"`

	// notifyFlags is a bitmask of flags used to indicate to the protocol
	// goroutine what it was notified; this is only accessed atomically.
	notifyFlags uint32 `state:"nosave"`

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepaliveIdle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// userTimeout if non-zero specifies a user specified timeout for
	// a connection w/ pending data to send. A connection that has pending
	// unacked data will be forcibly aborted if the timeout is reached
	// without any data being acked.
	userTimeout time.Duration

	// deferAccept if non-zero specifies a user specified time during
	// which the final ACK of a handshake will be dropped provided the
	// ACK is a bare ACK and carries no data. If the timeout is crossed then
	// the bare ACK is accepted and the connection is delivered to the
	// listener.
	deferAccept time.Duration

	// pendingAccepted tracks connections queued to be accepted. It is used to
	// ensure such queued connections are terminated before the accepted queue is
	// marked closed (by setting its capacity to zero).
	pendingAccepted sync.WaitGroup `state:"nosave"`

	// acceptMu protects accepted.
	acceptMu sync.Mutex `state:"nosave"`

	// acceptCond is a condition variable that can be used to block on when
	// accepted is full and an endpoint is ready to be delivered.
	//
	// We use this condition variable to block/unblock goroutines which
	// tried to deliver an endpoint but couldn't because accept backlog was
	// full ( See: endpoint.deliverAccepted ).
	acceptCond *sync.Cond `state:"nosave"`

	// accepted is used by a listening endpoint protocol goroutine to
	// send newly accepted connections to the endpoint so that they can be
	// read by Accept() calls.
	accepted accepted

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver `state:"wait"`
	snd *sender   `state:"wait"`

	// The goroutine drain completion notification channel.
	drainDone chan struct{} `state:"nosave"`

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{} `state:"nosave"`

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc `state:"nosave"`

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso stack.GSO

	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
	stats Stats `state:"nosave"`

	// tcpLingerTimeout is the maximum amount of time a socket stays in
	// TIME_WAIT state before being marked closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has called closed on the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool

	// txHash is the transport layer hash to be set on outbound packets
	// emitted by this endpoint.
	txHash uint32

	// owner is used to get uid and gid of the packet.
	owner tcpip.PacketOwner

	// ops is used to get socket level options.
	ops tcpip.SocketOptions

	// lastOutOfWindowAckTime is the time at which an ACK was sent in response
	// to an out of window segment being received by this endpoint.
	lastOutOfWindowAckTime tcpip.MonotonicTime
}

// UniqueID implements stack.TransportEndpoint.UniqueID.
func (e *endpoint) UniqueID() uint64 {
	return e.uniqueID
}

// calculateAdvertisedMSS calculates the MSS to advertise.
//
// If userMSS is non-zero and is not greater than the maximum possible MSS for
// r, it will be used; otherwise, the maximum possible MSS will be used.
func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
	// The maximum possible MSS is dependent on the route.
	// TODO(b/143359391): Respect TCP Min and Max size.
	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)

	if userMSS != 0 && userMSS < maxMSS {
		return userMSS
	}

	return maxMSS
}

// LockUser tries to lock e.mu and if it fails it will check if the lock is held
// by another syscall goroutine. If yes, then it will go to sleep waiting for
// the lock to be released, if not then it will spin till it acquires the lock
// or another syscall goroutine acquires it in which case it will go to sleep as
// described above.
//
// The assumption behind spinning here being that background packet processing
// should not be holding the lock for long and spinning reduces latency as we
// avoid an expensive sleep/wakeup of the syscall goroutine.
// +checklocksacquire:e.mu
func (e *endpoint) LockUser() {
	for {
		// Try first if the sock is locked then check if it's owned
		// by another user goroutine if not then we spin, otherwise
		// we just go to sleep on the Lock() and wait.
		if !e.mu.TryLock() {
			// If socket is owned by the user then just go to sleep
			// as the lock could be held for a reasonably long time.
			if atomic.LoadUint32(&e.ownedByUser) == 1 {
				e.mu.Lock()
				atomic.StoreUint32(&e.ownedByUser, 1)
				return
			}
			// Spin but yield the processor since the lower half
			// should yield the lock soon.
			runtime.Gosched()
			continue
		}
		atomic.StoreUint32(&e.ownedByUser, 1)
		return // +checklocksforce
	}
}

// UnlockUser will check if there are any segments already queued for processing
// and process any such segments before unlocking e.mu. This is required because
// when packets arrive and the endpoint lock is already held then such packets
// are queued up to be processed. If the lock is held by the endpoint goroutine
// then it will process these packets but if the lock is instead held by the
// syscall goroutine then we can have the syscall goroutine process the backlog
// before unlocking.
//
// This avoids an unnecessary wakeup of the endpoint protocol goroutine for the
// endpoint. It's also required eventually when we get rid of the endpoint
// protocol goroutine altogether.
//
// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
// +checklocksrelease:e.mu
func (e *endpoint) UnlockUser() {
	// Lock segment queue before checking so that we avoid a race where
	// segments can be queued between the time we check if queue is empty
	// and actually unlock the endpoint mutex.
	for {
		e.segmentQueue.mu.Lock()
		if e.segmentQueue.emptyLocked() {
			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
				panic("e.UnlockUser() called without calling e.LockUser()")
			}
			e.mu.Unlock()
			e.segmentQueue.mu.Unlock()
			return
		}
		e.segmentQueue.mu.Unlock()

		switch e.EndpointState() {
		case StateEstablished:
			if err := e.handleSegmentsLocked(true /* fastPath */); err != nil {
				e.notifyProtocolGoroutine(notifyTickleWorker)
			}
		default:
			// Since we are waking the endpoint goroutine here just unlock
			// and let it process the queued segments.
			e.newSegmentWaker.Assert()
			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
				panic("e.UnlockUser() called without calling e.LockUser()")
			}
			e.mu.Unlock()
			return
		}
	}
}

// StopWork halts packet processing. Only to be used in tests.
// +checklocksacquire:e.mu
func (e *endpoint) StopWork() {
	e.mu.Lock()
}

// ResumeWork resumes packet processing. Only to be used in tests.
// +checklocksrelease:e.mu
func (e *endpoint) ResumeWork() {
	e.mu.Unlock()
}

// setEndpointState updates the state of the endpoint to state atomically. This
// method is unexported as the only place we should update the state is in this
// package but we allow the state to be read freely without holding e.mu.
//
// Precondition: e.mu must be held to call this method.
756 func (e *endpoint) setEndpointState(state EndpointState) { 757 oldstate := EndpointState(atomic.LoadUint32(&e.state)) 758 switch state { 759 case StateEstablished: 760 e.stack.Stats().TCP.CurrentEstablished.Increment() 761 e.stack.Stats().TCP.CurrentConnected.Increment() 762 case StateError: 763 fallthrough 764 case StateClose: 765 if oldstate == StateCloseWait || oldstate == StateEstablished { 766 e.stack.Stats().TCP.EstablishedResets.Increment() 767 } 768 fallthrough 769 default: 770 if oldstate == StateEstablished { 771 e.stack.Stats().TCP.CurrentEstablished.Decrement() 772 } 773 } 774 atomic.StoreUint32(&e.state, uint32(state)) 775 } 776 777 // EndpointState returns the current state of the endpoint. 778 func (e *endpoint) EndpointState() EndpointState { 779 return EndpointState(atomic.LoadUint32(&e.state)) 780 } 781 782 // setRecentTimestamp sets the recentTS field to the provided value. 783 func (e *endpoint) setRecentTimestamp(recentTS uint32) { 784 e.RecentTS = recentTS 785 e.recentTSTime = e.stack.Clock().NowMonotonic() 786 } 787 788 // recentTimestamp returns the value of the recentTS field. 789 func (e *endpoint) recentTimestamp() uint32 { 790 return e.RecentTS 791 } 792 793 // keepalive is a synchronization wrapper used to appease stateify. See the 794 // comment in endpoint, where it is used. 
//
// +stateify savable
type keepalive struct {
	sync.Mutex `state:"nosave"`
	idle       time.Duration
	interval   time.Duration
	count      int
	unacked    int
	timer      timer       `state:"nosave"`
	waker      sleep.Waker `state:"nosave"`
}

// newEndpoint creates a new TCP endpoint on the given stack for the given
// network protocol, applying Linux-like defaults and any stack-wide TCP
// protocol options (buffer sizes, congestion control, linger timeout, etc.).
func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
	e := &endpoint{
		stack: s,
		TransportEndpointInfo: stack.TransportEndpointInfo{
			NetProto:   netProto,
			TransProto: header.TCPProtocolNumber,
		},
		sndQueueInfo: sndQueueInfo{
			TCPSndBufState: stack.TCPSndBufState{
				SndMTU: math.MaxInt32,
			},
		},
		waiterQueue: waiterQueue,
		state:       uint32(StateInitial),
		keepalive: keepalive{
			// Linux defaults.
			idle:     2 * time.Hour,
			interval: 75 * time.Second,
			count:    9,
		},
		uniqueID:      s.UniqueID(),
		txHash:        s.Rand().Uint32(),
		windowClamp:   DefaultReceiveBufferSize,
		maxSynRetries: DefaultSynRetries,
	}
	e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits)
	e.ops.SetMulticastLoop(true)
	e.ops.SetQuickAck(true)
	e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */)
	e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */)

	// Stack-wide protocol options, when present, override the defaults set
	// above.
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
		e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */)
	}

	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
		e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */)
	}

	var cs tcpip.CongestionControlOption
	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
		e.cc = cs
	}

	var mrb tcpip.TCPModerateReceiveBufferOption
	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
		e.rcvQueueInfo.RcvAutoParams.Disabled = !bool(mrb)
	}

	var de tcpip.TCPDelayEnabled
	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
		e.ops.SetDelayOption(true)
	}

	var tcpLT tcpip.TCPLingerTimeoutOption
	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
		e.tcpLingerTimeout = time.Duration(tcpLT)
	}

	var synRetries tcpip.TCPSynRetriesOption
	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
		e.maxSynRetries = uint8(synRetries)
	}

	s.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery)

	if p := s.GetTCPProbe(); p != nil {
		e.probe = p
	}

	e.segmentQueue.ep = e
	e.TSOffset = timeStampOffset(e.stack.Rand())
	e.acceptCond = sync.NewCond(&e.acceptMu)
	e.keepalive.timer.init(e.stack.Clock(), &e.keepalive.waker)

	return e
}

// Readiness returns the current readiness of the endpoint. For example, if
// waiter.EventIn is set, the endpoint is immediately readable.
func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
	result := waiter.EventMask(0)

	switch e.EndpointState() {
	case StateInitial, StateBound:
		// This prevents blocking of new sockets which are not
		// connected when SO_LINGER is set.
		result |= waiter.EventHUp

	case StateConnecting, StateSynSent, StateSynRecv:
		// Ready for nothing.

	case StateClose, StateError, StateTimeWait:
		// Ready for anything.
		result = mask

	case StateListen:
		// Check if there's anything in the accepted queue.
		if (mask & waiter.ReadableEvents) != 0 {
			e.acceptMu.Lock()
			if e.accepted.endpoints.Len() != 0 {
				result |= waiter.ReadableEvents
			}
			e.acceptMu.Unlock()
		}
	}
	if e.EndpointState().connected() {
		// Determine if the endpoint is writable if requested.
		if (mask & waiter.WritableEvents) != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			sndBufSize := e.getSendBufferSize()
			if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize {
				result |= waiter.WritableEvents
			}
			e.sndQueueInfo.sndQueueMu.Unlock()
		}

		// Determine if the endpoint is readable if requested.
		if (mask & waiter.ReadableEvents) != 0 {
			e.rcvQueueInfo.rcvQueueMu.Lock()
			if e.rcvQueueInfo.RcvBufUsed > 0 || e.rcvQueueInfo.RcvClosed {
				result |= waiter.ReadableEvents
			}
			e.rcvQueueInfo.rcvQueueMu.Unlock()
		}
	}

	return result
}

// fetchNotifications atomically retrieves and clears the pending notification
// flags.
func (e *endpoint) fetchNotifications() uint32 {
	return atomic.SwapUint32(&e.notifyFlags, 0)
}

// notifyProtocolGoroutine sets the given notification flag bits and, if this
// transitions the flag set from empty to non-empty, wakes the protocol
// goroutine to process them.
func (e *endpoint) notifyProtocolGoroutine(n uint32) {
	for {
		v := atomic.LoadUint32(&e.notifyFlags)
		if v&n == n {
			// The flags are already set.
			return
		}

		if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) {
			if v == 0 {
				// We are causing a transition from no flags to
				// at least one flag set, so we must cause the
				// protocol goroutine to wake up.
				e.notificationWaker.Assert()
			}
			return
		}
	}
}

// Abort implements stack.TransportEndpoint.Abort.
func (e *endpoint) Abort() {
	// The abort notification is not processed synchronously, so no
	// synchronization is needed.
	//
	// If the endpoint becomes connected after this check, we still close
	// the endpoint. This worst case results in a slower abort.
	//
	// If the endpoint disconnected after the check, nothing needs to be
	// done, so sending a notification which will potentially be ignored is
	// fine.
	//
	// If the endpoint connecting finishes after the check, the endpoint
	// is either in a connected state (where we would notifyAbort anyway),
	// SYN-RECV (where we would also notifyAbort anyway), or in an error
	// state where nothing is required and the notification can be safely
	// ignored.
	//
	// Endpoints where a Close during connecting or SYN-RECV state would be
	// problematic are set to state connecting before being registered (and
	// thus possible to be Aborted). They are never available in initial
	// state.
	//
	// Endpoints transitioning from initial to connecting state may be
	// safely either closed or sent notifyAbort.
	if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() {
		e.notifyProtocolGoroutine(notifyAbort)
		return
	}
	e.Close()
}

// Close puts the endpoint in a closed state and frees all resources associated
// with it. It must be called only once and with no other concurrent calls to
// the endpoint.
func (e *endpoint) Close() {
	e.LockUser()
	defer e.UnlockUser()
	if e.closed {
		return
	}

	linger := e.SocketOptions().GetLinger()
	if linger.Enabled && linger.Timeout == 0 {
		s := e.EndpointState()
		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
		if isResetState {
			// Close the endpoint without doing full shutdown and
			// send a RST.
			e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
			e.closeNoShutdownLocked()

			// Wake up worker to close the endpoint.
			switch s {
			case StateSynRecv:
				e.notifyProtocolGoroutine(notifyClose)
			default:
				e.notifyProtocolGoroutine(notifyTickleWorker)
			}
			return
		}
	}

	// Issue a shutdown so that the peer knows we won't send any more data
	// if we're connected, or stop accepting if we're listening.
1028 e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) 1029 e.closeNoShutdownLocked() 1030 } 1031 1032 // closeNoShutdown closes the endpoint without doing a full shutdown. 1033 func (e *endpoint) closeNoShutdownLocked() { 1034 // For listening sockets, we always release ports inline so that they 1035 // are immediately available for reuse after Close() is called. If also 1036 // registered, we unregister as well otherwise the next user would fail 1037 // in Listen() when trying to register. 1038 if e.EndpointState() == StateListen && e.isPortReserved { 1039 if e.isRegistered { 1040 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1041 e.isRegistered = false 1042 } 1043 1044 portRes := ports.Reservation{ 1045 Networks: e.effectiveNetProtos, 1046 Transport: ProtocolNumber, 1047 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1048 Port: e.TransportEndpointInfo.ID.LocalPort, 1049 Flags: e.boundPortFlags, 1050 BindToDevice: e.boundBindToDevice, 1051 Dest: e.boundDest, 1052 } 1053 e.stack.ReleasePort(portRes) 1054 e.isPortReserved = false 1055 e.boundBindToDevice = 0 1056 e.boundPortFlags = ports.Flags{} 1057 e.boundDest = tcpip.FullAddress{} 1058 } 1059 1060 // Mark endpoint as closed. 1061 e.closed = true 1062 1063 switch e.EndpointState() { 1064 case StateClose, StateError: 1065 return 1066 } 1067 1068 eventMask := waiter.ReadableEvents | waiter.WritableEvents 1069 // Either perform the local cleanup or kick the worker to make sure it 1070 // knows it needs to cleanup. 1071 if e.workerRunning { 1072 e.workerCleanup = true 1073 tcpip.AddDanglingEndpoint(e) 1074 // Worker will remove the dangling endpoint when the endpoint 1075 // goroutine terminates. 1076 e.notifyProtocolGoroutine(notifyClose) 1077 } else { 1078 e.transitionToStateCloseLocked() 1079 // Notify that the endpoint is closed. 
1080 eventMask |= waiter.EventHUp 1081 } 1082 1083 // The TCP closing state-machine would eventually notify EventHUp, but we 1084 // notify EventIn|EventOut immediately to unblock any blocked waiters. 1085 e.waiterQueue.Notify(eventMask) 1086 } 1087 1088 // closePendingAcceptableConnections closes all connections that have completed 1089 // handshake but not yet been delivered to the application. 1090 func (e *endpoint) closePendingAcceptableConnectionsLocked() { 1091 e.acceptMu.Lock() 1092 acceptedCopy := e.accepted 1093 e.accepted = accepted{} 1094 e.acceptMu.Unlock() 1095 1096 if acceptedCopy == (accepted{}) { 1097 return 1098 } 1099 1100 e.acceptCond.Broadcast() 1101 1102 // Reset all connections that are waiting to be accepted. 1103 for n := acceptedCopy.endpoints.Front(); n != nil; n = n.Next() { 1104 n.Value.(*endpoint).notifyProtocolGoroutine(notifyReset) 1105 } 1106 // Wait for reset of all endpoints that are still waiting to be delivered to 1107 // the now closed accepted. 1108 e.pendingAccepted.Wait() 1109 } 1110 1111 // cleanupLocked frees all resources associated with the endpoint. It is called 1112 // after Close() is called and the worker goroutine (if any) is done with its 1113 // work. 1114 func (e *endpoint) cleanupLocked() { 1115 // Close all endpoints that might have been accepted by TCP but not by 1116 // the client. 
1117 e.closePendingAcceptableConnectionsLocked() 1118 e.keepalive.timer.cleanup() 1119 1120 e.workerCleanup = false 1121 1122 if e.isRegistered { 1123 e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) 1124 e.isRegistered = false 1125 } 1126 1127 if e.isPortReserved { 1128 portRes := ports.Reservation{ 1129 Networks: e.effectiveNetProtos, 1130 Transport: ProtocolNumber, 1131 Addr: e.TransportEndpointInfo.ID.LocalAddress, 1132 Port: e.TransportEndpointInfo.ID.LocalPort, 1133 Flags: e.boundPortFlags, 1134 BindToDevice: e.boundBindToDevice, 1135 Dest: e.boundDest, 1136 } 1137 e.stack.ReleasePort(portRes) 1138 e.isPortReserved = false 1139 } 1140 e.boundBindToDevice = 0 1141 e.boundPortFlags = ports.Flags{} 1142 e.boundDest = tcpip.FullAddress{} 1143 1144 if e.route != nil { 1145 e.route.Release() 1146 e.route = nil 1147 } 1148 1149 e.stack.CompleteTransportEndpointCleanup(e) 1150 tcpip.DeleteDanglingEndpoint(e) 1151 } 1152 1153 // wndFromSpace returns the window that we can advertise based on the available 1154 // receive buffer space. 1155 func wndFromSpace(space int) int { 1156 return space >> rcvAdvWndScale 1157 } 1158 1159 // initialReceiveWindow returns the initial receive window to advertise in the 1160 // SYN/SYN-ACK. 1161 func (e *endpoint) initialReceiveWindow() int { 1162 rcvWnd := wndFromSpace(e.receiveBufferAvailable()) 1163 if rcvWnd > math.MaxUint16 { 1164 rcvWnd = math.MaxUint16 1165 } 1166 1167 // Use the user supplied MSS, if available. 1168 routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2 1169 if rcvWnd > routeWnd { 1170 rcvWnd = routeWnd 1171 } 1172 rcvWndScale := e.rcvWndScaleForHandshake() 1173 1174 // Round-down the rcvWnd to a multiple of wndScale. This ensures that the 1175 // window offered in SYN won't be reduced due to the loss of precision if 1176 // window scaling is enabled after the handshake. 
1177 rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale) 1178 1179 // Ensure we can always accept at least 1 byte if the scale specified 1180 // was too high for the provided rcvWnd. 1181 if rcvWnd == 0 { 1182 rcvWnd = 1 1183 } 1184 1185 return rcvWnd 1186 } 1187 1188 // ModerateRecvBuf adjusts the receive buffer and the advertised window 1189 // based on the number of bytes copied to userspace. 1190 func (e *endpoint) ModerateRecvBuf(copied int) { 1191 e.LockUser() 1192 defer e.UnlockUser() 1193 1194 e.rcvQueueInfo.rcvQueueMu.Lock() 1195 if e.rcvQueueInfo.RcvAutoParams.Disabled { 1196 e.rcvQueueInfo.rcvQueueMu.Unlock() 1197 return 1198 } 1199 now := e.stack.Clock().NowMonotonic() 1200 if rtt := e.rcvQueueInfo.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.rcvQueueInfo.RcvAutoParams.MeasureTime) < rtt { 1201 e.rcvQueueInfo.RcvAutoParams.CopiedBytes += copied 1202 e.rcvQueueInfo.rcvQueueMu.Unlock() 1203 return 1204 } 1205 prevRTTCopied := e.rcvQueueInfo.RcvAutoParams.CopiedBytes + copied 1206 prevCopied := e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes 1207 rcvWnd := 0 1208 if prevRTTCopied > prevCopied { 1209 // The minimal receive window based on what was copied by the app 1210 // in the immediate preceding RTT and some extra buffer for 16 1211 // segments to account for variations. 1212 // We multiply by 2 to account for packet losses. 1213 rcvWnd = prevRTTCopied*2 + 16*int(e.amss) 1214 1215 // Scale for slow start based on bytes copied in this RTT vs previous. 1216 grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied 1217 1218 // Multiply growth factor by 2 again to account for sender being 1219 // in slow-start where the sender grows it's congestion window 1220 // by 100% per RTT. 1221 rcvWnd += grow * 2 1222 1223 // Make sure auto tuned buffer size can always receive upto 2x 1224 // the initial window of 10 segments. 
1225 if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd { 1226 rcvWnd = minRcvWnd 1227 } 1228 1229 // Cap the auto tuned buffer size by the maximum permissible 1230 // receive buffer size. 1231 if max := e.maxReceiveBufferSize(); rcvWnd > max { 1232 rcvWnd = max 1233 } 1234 1235 // We do not adjust downwards as that can cause the receiver to 1236 // reject valid data that might already be in flight as the 1237 // acceptable window will shrink. 1238 rcvBufSize := int(e.ops.GetReceiveBufferSize()) 1239 if rcvWnd > rcvBufSize { 1240 availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize)) 1241 e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */) 1242 availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd)) 1243 if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above { 1244 e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) 1245 } 1246 } 1247 1248 // We only update PrevCopiedBytes when we grow the buffer because in cases 1249 // where PrevCopiedBytes > prevRTTCopied the existing buffer is already big 1250 // enough to handle the current rate and we don't need to do any 1251 // adjustments. 1252 e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = prevRTTCopied 1253 } 1254 e.rcvQueueInfo.RcvAutoParams.MeasureTime = now 1255 e.rcvQueueInfo.RcvAutoParams.CopiedBytes = 0 1256 e.rcvQueueInfo.rcvQueueMu.Unlock() 1257 } 1258 1259 // SetOwner implements tcpip.Endpoint.SetOwner. 1260 func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { 1261 e.owner = owner 1262 } 1263 1264 // Preconditions: e.mu must be held to call this function. 1265 func (e *endpoint) hardErrorLocked() tcpip.Error { 1266 err := e.hardError 1267 e.hardError = nil 1268 return err 1269 } 1270 1271 // Preconditions: e.mu must be held to call this function. 
// lastErrorLocked reads and clears (consumes) the stored soft error.
func (e *endpoint) lastErrorLocked() tcpip.Error {
	e.lastErrorMu.Lock()
	defer e.lastErrorMu.Unlock()
	err := e.lastError
	e.lastError = nil
	return err
}

// LastError implements tcpip.Endpoint.LastError.
func (e *endpoint) LastError() tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	// Hard errors (e.g. connection reset) take precedence over soft ones.
	if err := e.hardErrorLocked(); err != nil {
		return err
	}
	return e.lastErrorLocked()
}

// LastErrorLocked reads and clears lastError with e.mu held.
// Only to be used in tests.
func (e *endpoint) LastErrorLocked() tcpip.Error {
	return e.lastErrorLocked()
}

// UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
func (e *endpoint) UpdateLastError(err tcpip.Error) {
	e.LockUser()
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()
	e.UnlockUser()
}

// Read implements tcpip.Endpoint.Read.
func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	e.rcvReadMu.Lock()
	defer e.rcvReadMu.Unlock()

	// N.B. Here we get a range of segments to be processed. It is safe to not
	// hold rcvQueueMu when processing, since we hold rcvReadMu to ensure only we
	// can remove segments from the list through commitRead().
	first, last, serr := e.startRead()
	if serr != nil {
		if _, ok := serr.(*tcpip.ErrClosedForReceive); ok {
			e.stats.ReadErrors.ReadClosed.Increment()
		}
		return tcpip.ReadResult{}, serr
	}

	var err error
	done := 0
	s := first
	for s != nil {
		var n int
		n, err = s.data.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.
		done += n

		if opts.Peek {
			// For peek, we use the (first, last) range of segment returned from
			// startRead. We don't consume the receive buffer, so commitRead should
			// not be called.
			//
			// N.B. It is important to use `last` to determine the last segment, since
			// appending can happen while we process, and will lead to data race.
			if s == last {
				break
			}
			s = s.Next()
		} else {
			// N.B. commitRead() conveniently returns the next segment to read, after
			// removing the data/segment that is read.
			s = e.commitRead(n)
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}

// startRead checks that endpoint is in a readable state, and return the
// inclusive range of segments that can be read.
//
// Precondition: e.rcvReadMu must be held.
func (e *endpoint) startRead() (first, last *segment, err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// When in SYN-SENT state, let the caller block on the receive.
	// An application can initiate a non-blocking connect and then block
	// on a receive. It can expect to read any data after the handshake
	// is complete. RFC793, section 3.9, p58.
	if e.EndpointState() == StateSynSent {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	e.rcvQueueInfo.rcvQueueMu.Lock()
	defer e.rcvQueueInfo.rcvQueueMu.Unlock()

	bufUsed := e.rcvQueueInfo.RcvBufUsed
	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
		if s == StateError {
			if err := e.hardErrorLocked(); err != nil {
				return nil, nil, err
			}
			return nil, nil, &tcpip.ErrClosedForReceive{}
		}
		e.stats.ReadErrors.NotConnected.Increment()
		return nil, nil, &tcpip.ErrNotConnected{}
	}

	if e.rcvQueueInfo.RcvBufUsed == 0 {
		if e.rcvQueueInfo.RcvClosed || !e.EndpointState().connected() {
			return nil, nil, &tcpip.ErrClosedForReceive{}
		}
		return nil, nil, &tcpip.ErrWouldBlock{}
	}

	return e.rcvQueueInfo.rcvQueue.Front(), e.rcvQueueInfo.rcvQueue.Back(), nil
}

// commitRead commits a read of done bytes and returns the next non-empty
// segment to read. Data read from the segment must have also been removed from
// the segment in order for this method to work correctly.
//
// It is performance critical to call commitRead frequently when servicing a big
// Read request, so TCP can make progress timely. Right now, it is designed to
// do this per segment read, hence this method conveniently returns the next
// segment to read while holding the lock.
//
// Precondition: e.rcvReadMu must be held.
func (e *endpoint) commitRead(done int) *segment {
	e.LockUser()
	defer e.UnlockUser()
	e.rcvQueueInfo.rcvQueueMu.Lock()
	defer e.rcvQueueInfo.rcvQueueMu.Unlock()

	// Drop fully-consumed segments from the front of the queue.
	memDelta := 0
	s := e.rcvQueueInfo.rcvQueue.Front()
	for s != nil && s.data.Size() == 0 {
		e.rcvQueueInfo.rcvQueue.Remove(s)
		// Memory is only considered released when the whole segment has been
		// read.
		memDelta += s.segMemSize()
		s.decRef()
		s = e.rcvQueueInfo.rcvQueue.Front()
	}
	e.rcvQueueInfo.RcvBufUsed -= done

	if memDelta > 0 {
		// If the window was small before this read and if the read freed up
		// enough buffer space, to either fit an aMSS or half a receive buffer
		// (whichever smaller), then notify the protocol goroutine to send a
		// window update.
		if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
			e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
		}
	}

	return e.rcvQueueInfo.rcvQueue.Front()
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// Caller must hold e.mu and e.sndQueueMu.
func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	switch s := e.EndpointState(); {
	case s == StateError:
		if err := e.hardErrorLocked(); err != nil {
			return 0, err
		}
		return 0, &tcpip.ErrClosedForSend{}
	case !s.connecting() && !s.connected():
		return 0, &tcpip.ErrClosedForSend{}
	case s.connecting():
		// As per RFC793, page 56, a send request arriving when in connecting
		// state, can be queued to be completed after the state becomes
		// connected. Return an error code for the caller of endpoint Write to
		// try again, until the connection handshake is complete.
		return 0, &tcpip.ErrWouldBlock{}
	}

	// Check if the connection has already been closed for sends.
	if e.sndQueueInfo.SndClosed {
		return 0, &tcpip.ErrClosedForSend{}
	}

	sndBufSize := e.getSendBufferSize()
	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
	if avail <= 0 {
		return 0, &tcpip.ErrWouldBlock{}
	}
	return avail, nil
}

// readFromPayloader reads a slice from the Payloader.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) ([]byte, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		// NOTE: the deferred Lock() calls reacquire the locks in the
		// reverse of the release order before returning to the caller,
		// which expects them held (see the +checklocks annotations).
		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return nil, nil
	}
	v := make([]byte, avail)
	n, err := p.Read(v)
	if err != nil && err != io.EOF {
		return nil, &tcpip.ErrBadBuffer{}
	}
	return v[:n], nil
}

// queueSegment reads data from the payloader and returns a segment to be sent.
// +checklocks:e.mu
func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
	e.sndQueueInfo.sndQueueMu.Lock()
	defer e.sndQueueInfo.sndQueueMu.Unlock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.stats.WriteErrors.WriteClosed.Increment()
		return nil, 0, err
	}

	v, err := e.readFromPayloader(p, opts, avail)
	if err != nil {
		return nil, 0, err
	}
	if !opts.Atomic {
		// Since we released locks in between it's possible that the
		// endpoint transitioned to a CLOSED/ERROR states so make
		// sure endpoint is still writable before trying to write.
		avail, err := e.isEndpointWritableLocked()
		if err != nil {
			e.stats.WriteErrors.WriteClosed.Increment()
			return nil, 0, err
		}

		// Discard any excess data copied in due to avail being reduced due
		// to a simultaneous write call to the socket.
		if avail < len(v) {
			v = v[:avail]
		}
	}

	// Add data to the send queue.
	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v)
	e.sndQueueInfo.SndBufUsed += len(v)
	e.snd.writeList.PushBack(s)

	return s, len(v), nil
}

// Write writes data to the endpoint's peer.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.LockUser()
	defer e.UnlockUser()

	// Return if either we didn't queue anything or if an error occurred while
	// attempting to queue data.
	nextSeg, n, err := e.queueSegment(p, opts)
	if n == 0 || err != nil {
		return 0, err
	}

	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window without checking for shrinking or scaling
// applied.
// Precondition: e.mu and e.rcvQueueMu must be held.
func (e *endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
	maxWindow := wndFromSpace(rcvBufSize)
	wndFromUsedBytes := maxWindow - e.rcvQueueInfo.RcvBufUsed

	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
	// cases where we receive a lot of small segments the segment overhead is a
	// lot higher and we can run out of socket buffer space before we can fill the
	// previous window we advertised. In cases where we receive MSS sized or close
	// MSS sized segments we will probably run out of window space before we
	// exhaust receive buffer.
	newWnd := wndFromAvailable
	if newWnd > wndFromUsedBytes {
		newWnd = wndFromUsedBytes
	}
	if newWnd < 0 {
		newWnd = 0
	}
	return seqnum.Size(newWnd)
}

// selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
func (e *endpoint) selectWindow() (wnd seqnum.Size) {
	e.rcvQueueInfo.rcvQueueMu.Lock()
	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueInfo.rcvQueueMu.Unlock()
	return wnd
}

// windowCrossedACKThresholdLocked checks if the receive window to be announced
// would be under aMSS or under the window derived from half receive buffer,
// whichever smaller. This is useful as a receive side silly window syndrome
// prevention mechanism. If window grows to reasonable value, we should send ACK
// to the sender to inform the rx space is now large.
We also want ensure a 1615 // series of small read()'s won't trigger a flood of spurious tiny ACK's. 1616 // 1617 // For large receive buffers, the threshold is aMSS - once reader reads more 1618 // than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of 1619 // receive buffer size. This is chosen arbitrarily. 1620 // crossed will be true if the window size crossed the ACK threshold. 1621 // above will be true if the new window is >= ACK threshold and false 1622 // otherwise. 1623 // 1624 // Precondition: e.mu and e.rcvQueueMu must be held. 1625 func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) { 1626 newAvail := int(e.selectWindowLocked(rcvBufSize)) 1627 oldAvail := newAvail - deltaBefore 1628 if oldAvail < 0 { 1629 oldAvail = 0 1630 } 1631 threshold := int(e.amss) 1632 // rcvBufFraction is the inverse of the fraction of receive buffer size that 1633 // is used to decide if the available buffer space is now above it. 1634 const rcvBufFraction = 2 1635 if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold { 1636 threshold = wndThreshold 1637 } 1638 switch { 1639 case oldAvail < threshold && newAvail >= threshold: 1640 return true, true 1641 case oldAvail >= threshold && newAvail < threshold: 1642 return true, false 1643 } 1644 return false, false 1645 } 1646 1647 // OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet. 1648 func (e *endpoint) OnReuseAddressSet(v bool) { 1649 e.LockUser() 1650 e.portFlags.TupleOnly = v 1651 e.UnlockUser() 1652 } 1653 1654 // OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet. 1655 func (e *endpoint) OnReusePortSet(v bool) { 1656 e.LockUser() 1657 e.portFlags.LoadBalanced = v 1658 e.UnlockUser() 1659 } 1660 1661 // OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet. 
1662 func (e *endpoint) OnKeepAliveSet(bool) { 1663 e.notifyProtocolGoroutine(notifyKeepaliveChanged) 1664 } 1665 1666 // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet. 1667 func (e *endpoint) OnDelayOptionSet(v bool) { 1668 if !v { 1669 // Handle delayed data. 1670 e.sndQueueInfo.sndWaker.Assert() 1671 } 1672 } 1673 1674 // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet. 1675 func (e *endpoint) OnCorkOptionSet(v bool) { 1676 if !v { 1677 // Handle the corked data. 1678 e.sndQueueInfo.sndWaker.Assert() 1679 } 1680 } 1681 1682 func (e *endpoint) getSendBufferSize() int { 1683 return int(e.ops.GetSendBufferSize()) 1684 } 1685 1686 // OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize. 1687 func (e *endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64) { 1688 e.LockUser() 1689 e.rcvQueueInfo.rcvQueueMu.Lock() 1690 1691 // Make sure the receive buffer size allows us to send a 1692 // non-zero window size. 1693 scale := uint8(0) 1694 if e.rcv != nil { 1695 scale = e.rcv.RcvWndScale 1696 } 1697 if rcvBufSz>>scale == 0 { 1698 rcvBufSz = 1 << scale 1699 } 1700 1701 availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz))) 1702 availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz))) 1703 e.rcvQueueInfo.RcvAutoParams.Disabled = true 1704 1705 // Immediately send an ACK to uncork the sender silly window 1706 // syndrome prevetion, when our available space grows above aMSS 1707 // or half receive buffer, whichever smaller. 1708 if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above { 1709 e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) 1710 } 1711 1712 e.rcvQueueInfo.rcvQueueMu.Unlock() 1713 e.UnlockUser() 1714 return rcvBufSz 1715 } 1716 1717 // SetSockOptInt sets a socket option. 
func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	// Lower 2 bits of the TOS/TrafficClass byte are the ECN field, which we
	// must not let the application set directly. RFC 3168, section 5.
	const inetECNMask = 3

	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		e.keepalive.count = v
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case tcpip.IPv4TOSOption:
		e.LockUser()
		// TODO(github.com/SagerNet/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		// TODO(github.com/SagerNet/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.MaxSegOption:
		userMSS := v
		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.userMSS = uint16(userMSS)
		e.UnlockUser()
		e.notifyProtocolGoroutine(notifyMSSChanged)

	case tcpip.MTUDiscoverOption:
		// Return not supported if attempting to set this option to
		// anything other than path MTU discovery disabled.
		if v != tcpip.PMTUDiscoveryDont {
			return &tcpip.ErrNotSupported{}
		}

	case tcpip.TTLOption:
		e.LockUser()
		e.ttl = uint8(v)
		e.UnlockUser()

	case tcpip.TCPSynCountOption:
		if v < 1 || v > 255 {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.maxSynRetries = uint8(v)
		e.UnlockUser()

	case tcpip.TCPWindowClampOption:
		if v == 0 {
			e.LockUser()
			// Clearing the clamp is only allowed before the
			// connection is established.
			switch e.EndpointState() {
			case StateClose, StateInitial:
				e.windowClamp = 0
				e.UnlockUser()
				return nil
			default:
				e.UnlockUser()
				return &tcpip.ErrInvalidOptionValue{}
			}
		}
		// Enforce a floor of half the minimum receive buffer size, when
		// the stack exposes that range.
		var rs tcpip.TCPReceiveBufferSizeRangeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
			if v < rs.Min/2 {
				v = rs.Min / 2
			}
		}
		e.LockUser()
		e.windowClamp = uint32(v)
		e.UnlockUser()
	}
	return nil
}

// HasNIC returns whether the endpoint's stack has a NIC with the given ID
// (ID 0 always matches).
func (e *endpoint) HasNIC(id int32) bool {
	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
}

// SetSockOpt sets a socket option.
func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	switch v := opt.(type) {
	case *tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(*v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case *tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(*v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		e.userTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.TCPAvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if *v == tcpip.CongestionControlOption(cc) {
				e.LockUser()
				state := e.EndpointState()
				e.cc = *v
				switch state {
				case StateEstablished:
					// Only reinitialize the congestion control state if the
					// connection is still established.
					if e.EndpointState() == state {
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
				}
				e.UnlockUser()
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return &tcpip.ErrNoSuchFile{}

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()

		switch {
		case *v < 0:
			// Same as effectively disabling TCPLinger timeout.
			*v = -1
		case *v == 0:
			// Same as the stack default.
			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
			}
			*v = stackLingerTimeout
		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
			// Cap it to Stack's default TCP_LINGER2 timeout.
			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
		default:
			// Accept the value unmodified.
		}

		e.tcpLingerTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		if time.Duration(*v) > MaxRTO {
			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
		}
		e.deferAccept = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.SocketDetachFilterOption:
		return nil

	default:
		return nil
	}
	return nil
}

// readyReceiveSize returns the number of bytes ready to be received.
func (e *endpoint) readyReceiveSize() (int, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// The endpoint cannot be in listen state.
	if e.EndpointState() == StateListen {
		return 0, &tcpip.ErrInvalidEndpointState{}
	}

	e.rcvQueueInfo.rcvQueueMu.Lock()
	defer e.rcvQueueInfo.rcvQueueMu.Unlock()

	return e.rcvQueueInfo.RcvBufUsed, nil
}

// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		v := e.keepalive.count
		e.keepalive.Unlock()
		return v, nil

	case tcpip.IPv4TOSOption:
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6TrafficClassOption:
		// Shares sendTOS with IPv4TOSOption; the same field backs both.
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.MaxSegOption:
		// This is just stubbed out. Linux never returns the user_mss
		// value as it either returns the defaultMSS or returns the
		// actual current MSS. Netstack just returns the defaultMSS
		// always for now.
		v := header.TCPDefaultMSS
		return v, nil

	case tcpip.MTUDiscoverOption:
		// Always return the path MTU discovery disabled setting since
		// it's the only one supported.
		return tcpip.PMTUDiscoveryDont, nil

	case tcpip.ReceiveQueueSizeOption:
		return e.readyReceiveSize()

	case tcpip.TTLOption:
		e.LockUser()
		v := int(e.ttl)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPSynCountOption:
		e.LockUser()
		v := int(e.maxSynRetries)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPWindowClampOption:
		e.LockUser()
		v := int(e.windowClamp)
		e.UnlockUser()
		return v, nil

	case tcpip.MulticastTTLOption:
		return 1, nil

	default:
		return -1, &tcpip.ErrUnknownProtocolOption{}
	}
}

// getTCPInfo returns a snapshot of connection state for TCP_INFO. Internal
// (netstack-only) states are reported as StateClose since they have no
// external meaning.
func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption {
	info := tcpip.TCPInfoOption{}
	e.LockUser()
	if state := e.EndpointState(); state.internal() {
		info.State = tcpip.EndpointState(StateClose)
	} else {
		info.State = tcpip.EndpointState(state)
	}
	snd := e.snd
	if snd != nil {
		// We do not calculate RTT before sending the data packets. If
		// the connection did not send and receive data, then RTT will
		// be zero.
		snd.rtt.Lock()
		info.RTT = snd.rtt.TCPRTTState.SRTT
		info.RTTVar = snd.rtt.TCPRTTState.RTTVar
		snd.rtt.Unlock()

		info.RTO = snd.RTO
		info.CcState = snd.state
		info.SndSsthresh = uint32(snd.Ssthresh)
		info.SndCwnd = uint32(snd.SndCwnd)
		info.ReorderSeen = snd.rc.Reord
	}
	e.UnlockUser()
	return info
}

// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
	switch o := opt.(type) {
	case *tcpip.TCPInfoOption:
		*o = e.getTCPInfo()

	case *tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
		e.keepalive.Unlock()

	case *tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
		e.keepalive.Unlock()

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		e.LockUser()
		*o = e.cc
		e.UnlockUser()

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()
		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
		e.UnlockUser()

	case *tcpip.OriginalDestinationOption:
		// Consult iptables for the pre-NAT destination of this flow
		// (SO_ORIGINAL_DST).
		e.LockUser()
		ipt := e.stack.IPTables()
		addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto)
		e.UnlockUser()
		if err != nil {
			return err
		}
		*o = tcpip.OriginalDestinationOption{
			Addr: addr,
			Port: port,
		}

	default:
		return &tcpip.ErrUnknownProtocolOption{}
	}
	return nil
}

// checkV4MappedLocked determines the effective network protocol and converts
// addr to its canonical form.
func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) {
	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
	if err != nil {
		return tcpip.FullAddress{}, 0, err
	}
	return unwrapped, netProto, nil
}

// Disconnect implements tcpip.Endpoint.Disconnect.
func (*endpoint) Disconnect() tcpip.Error {
	return &tcpip.ErrNotSupported{}
}

// Connect connects the endpoint to its peer.
func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
	err := e.connect(addr, true, true)
	if err != nil {
		if !err.IgnoreStats() {
			// Connect failed. Let's wake up any waiters.
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// connect connects the endpoint to its peer. In the normal non-S/R case, the
// new connection is expected to run the main goroutine and perform handshake.
// In restore of previously connected endpoints, both ends will be passively
// created (so no new handshaking is done); for stack-accepted connections not
// yet accepted by the app, they are restored without running the main goroutine
// here.
func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrNoRoute{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer r.Release()

	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
	e.TransportEndpointInfo.ID.RemotePort = addr.Port

	if e.TransportEndpointInfo.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress

		// Calculate a port offset based on the destination IP/port and
		// src IP to ensure that for a given tuple (srcIP, destIP,
		// destPort) the offset used as a starting point is the same to
		// ensure that we can cycle through the port space effectively.
		portBuf := make([]byte, 2)
		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)

		h := jenkins.Sum32(e.stack.Seed())
		for _, s := range [][]byte{
			[]byte(e.ID.LocalAddress),
			[]byte(e.ID.RemoteAddress),
			portBuf,
		} {
			// Per io.Writer.Write:
			//
			// Write must return a non-nil error if it returns n < len(p).
			if _, err := h.Write(s); err != nil {
				panic(err)
			}
		}
		portOffset := h.Sum32()

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			// TIME-WAIT reuse restricted to flows where both ends are
			// loopback addresses.
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, tcpip.Error) {
			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
				return false, nil
			}
			portRes := ports.Reservation{
				Networks:     netProtos,
				Transport:    ProtocolNumber,
				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
				Port:         p,
				Flags:        e.portFlags,
				BindToDevice: bindToDevice,
				Dest:         addr,
			}
			if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.TransportEndpointInfo.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, r.NICID())
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				tcpEP := transEP.(*endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.workerCleanup = false
				tcpEP.cleanupLocked()
				tcpEP.notifyProtocolGoroutine(notifyAbort)
				tcpEP.UnlockUser()
				// Now try and Reserve again if it fails then we skip.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			id := e.TransportEndpointInfo.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				// Registration failed; release the reservation taken above
				// before deciding whether to retry with another port.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				e.stack.ReleasePort(portRes)
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.TransportEndpointInfo.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			e.stack.Stats().TCP.FailedPortReservations.Increment()
			return err
		}
	}

	e.isRegistered = true
	e.setEndpointState(StateConnecting)
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = netProtos
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		e.segmentQueue.mu.Lock()
		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				// Re-stamp queued segments with the (possibly updated)
				// endpoint ID and tickle the sender.
				s.id = e.TransportEndpointInfo.ID
				e.sndQueueInfo.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
	}

	if run {
		if handshake {
			h := e.newHandshake()
			e.setEndpointState(StateSynSent)
			h.start()
		}
		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
		e.workerRunning = true
		go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
	}

	// Note: returned even on success — callers treat ErrConnectStarted as
	// "connection in progress" (EINPROGRESS semantics).
	return &tcpip.ErrConnectStarted{}
}

// ConnectEndpoint is not supported.
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
	return &tcpip.ErrInvalidEndpointState{}
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	return e.shutdownLocked(flags)
}

// shutdownLocked is the implementation of Shutdown.
// Precondition: the endpoint lock must be held (via e.LockUser).
func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueInfo.rcvQueueMu.Lock()
			e.rcvQueueInfo.RcvClosed = true
			rcvBufUsed := e.rcvQueueInfo.RcvBufUsed
			e.rcvQueueInfo.rcvQueueMu.Unlock()

			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				// Wake up worker to terminate loop.
				e.notifyProtocolGoroutine(notifyTickleWorker)
				return nil
			}
			// Wake up any readers that maybe waiting for the stream to become
			// readable.
			e.waiterQueue.Notify(waiter.ReadableEvents)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), nil)
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that maybe waiting for the stream to become
			// writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueInfo.rcvQueueMu.Lock()
			e.rcvQueueInfo.RcvClosed = true
			e.rcvQueueInfo.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *endpoint) Listen(backlog int) tcpip.Error {
	err := e.listen(backlog)
	if err != nil {
		if !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// listen implements Listen: it binds if needed, registers with the demuxer,
// transitions the endpoint to StateListen and starts the listen loop. It also
// handles re-listen after shutdown and backlog resizing on an already
// listening endpoint.
func (e *endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()
		if e.accepted == (accepted{}) {
			// listen is called after shutdown.
			e.accepted.cap = backlog
			e.shutdownFlags = 0
			e.rcvQueueInfo.rcvQueueMu.Lock()
			e.rcvQueueInfo.RcvClosed = false
			e.rcvQueueInfo.rcvQueueMu.Unlock()
		} else {
			// Adjust the size of the backlog iff we can fit
			// existing pending connections into the new one.
			if e.accepted.endpoints.Len() > backlog {
				return &tcpip.ErrInvalidEndpointState{}
			}
			e.accepted.cap = backlog
		}

		// Notify any blocked goroutines that they can attempt to
		// deliver endpoints again.
		e.acceptCond.Broadcast()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		return err
	}

	e.isRegistered = true
	e.setEndpointState(StateListen)

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.accepted == (accepted{}) {
		e.accepted.cap = backlog
	}
	e.acceptMu.Unlock()

	e.workerRunning = true
	go e.protocolListenLoop( // S/R-SAFE: drained on save.
		seqnum.Size(e.receiveBufferAvailable()))
	return nil
}

// startAcceptedLoop sets up required state and starts a goroutine with the
// main loop for accepted connections.
// +checklocksrelease:e.mu
func (e *endpoint) startAcceptedLoop() {
	e.workerRunning = true
	e.mu.Unlock()
	wakerInitDone := make(chan struct{})
	go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save.
	<-wakerInitDone
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
//
// addr if not-nil will contain the peer address of the returned endpoint.
func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueInfo.rcvQueueMu.Lock()
	rcvClosed := e.rcvQueueInfo.RcvClosed
	e.rcvQueueInfo.rcvQueueMu.Unlock()
	// Endpoint must be in listen state before it can accept connections.
	if rcvClosed || e.EndpointState() != StateListen {
		return nil, nil, &tcpip.ErrInvalidEndpointState{}
	}

	// Get the new accepted endpoint.
	var n *endpoint
	e.acceptMu.Lock()
	if element := e.accepted.endpoints.Front(); element != nil {
		n = e.accepted.endpoints.Remove(element).(*endpoint)
	}
	e.acceptMu.Unlock()
	if n == nil {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}
	// A slot opened up in the accept queue; let a blocked deliverer proceed.
	e.acceptCond.Signal()
	if peerAddr != nil {
		*peerAddr = n.getRemoteAddress()
	}
	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally address.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return e.bindLocked(addr)
}

// bindLocked is the implementation of Bind.
// Precondition: the endpoint lock must be held (via e.LockUser).
func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if len(addr.Addr) != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.TransportEndpointInfo.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	portRes := ports.Reservation{
		Networks:     netProtos,
		Transport:    ProtocolNumber,
		Addr:         addr.Addr,
		Port:         addr.Port,
		Flags:        e.portFlags,
		BindToDevice: bindToDevice,
		Dest:         tcpip.FullAddress{},
	}
	port, err := e.stack.ReservePort(e.stack.Rand(), portRes, func(p uint16) (bool, tcpip.Error) {
		id := e.TransportEndpointInfo.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(github.com/SagerNet/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.LocalAddress,
		Port: e.TransportEndpointInfo.ID.LocalPort,
		NIC:  e.boundNICID,
	}, nil
}

// GetRemoteAddress returns the address to which the endpoint is connected.
func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if !e.EndpointState().connected() {
		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
	}

	return e.getRemoteAddress(), nil
}

// getRemoteAddress returns the remote address of the endpoint without
// checking connection state or taking the endpoint lock.
func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
		Port: e.TransportEndpointInfo.ID.RemotePort,
		NIC:  e.boundNICID,
	}
}

// HandlePacket implements stack.TransportEndpoint; it is intentionally a
// no-op for TCP (see comment below).
func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
	// TCP HandlePacket is not required anymore as inbound packets first
	// land at the Dispatcher which then can either deliver using the
	// worker go routine or directly do the invoke the tcp processing inline
	// based on the state of the endpoint.
}

// enqueueSegment hands the segment to the worker goroutine's queue. It
// returns false (and counts a drop) if the segment queue is full.
func (e *endpoint) enqueueSegment(s *segment) bool {
	// Send packet to worker goroutine.
	if !e.segmentQueue.enqueue(s) {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		return false
	}
	return true
}

// onICMPError records err as the endpoint's last error, queues it on the
// error queue when IP_RECVERR is enabled, and notifies the protocol
// goroutine.
func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	// Update the error queue if IP_RECVERR is enabled.
	if e.SocketOptions().GetRecvError() {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data().AsRange().ToOwnedView(),
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
				Port: e.TransportEndpointInfo.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.LocalAddress,
				Port: e.TransportEndpointInfo.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	// Notify of the error.
	e.notifyProtocolGoroutine(notifyError)
}

// HandleError implements stack.TransportEndpoint.
func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) {
	handlePacketTooBig := func(mtu uint32) {
		e.sndQueueInfo.sndQueueMu.Lock()
		e.sndQueueInfo.PacketTooBigCount++
		// Only shrink the recorded send MTU; never grow it from an ICMP hint.
		if v := int(mtu); v < e.sndQueueInfo.SndMTU {
			e.sndQueueInfo.SndMTU = v
		}
		e.sndQueueInfo.sndQueueMu.Unlock()
		e.notifyProtocolGoroutine(notifyMTUChanged)
	}

	// TODO(github.com/SagerNet/issues/5270): Handle all transport errors.
	switch transErr.Kind() {
	case stack.PacketTooBigTransportError:
		handlePacketTooBig(transErr.Info())
	case stack.DestinationHostUnreachableTransportError:
		e.onICMPError(&tcpip.ErrNoRoute{}, transErr, pkt)
	case stack.DestinationNetworkUnreachableTransportError:
		e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt)
	}
}

// updateSndBufferUsage is called by the protocol goroutine when room opens up
// in the send buffer. The number of newly available bytes is v.
func (e *endpoint) updateSndBufferUsage(v int) {
	sendBufferSize := e.getSendBufferSize()
	e.sndQueueInfo.sndQueueMu.Lock()
	notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1
	e.sndQueueInfo.SndBufUsed -= v
	// We only notify when there is half the sendBufferSize available after
	// a full buffer event occurs. This ensures that we don't wake up
	// writers to queue just 1-2 segments and go back to sleep.
	notify = notify && e.sndQueueInfo.SndBufUsed < sendBufferSize>>1
	e.sndQueueInfo.sndQueueMu.Unlock()

	if notify {
		e.waiterQueue.Notify(waiter.WritableEvents)
	}
}

// readyToRead is called by the protocol goroutine when a new segment is ready
// to be read, or when the connection is closed for receiving (in which case
// s will be nil).
func (e *endpoint) readyToRead(s *segment) {
	e.rcvQueueInfo.rcvQueueMu.Lock()
	if s != nil {
		e.rcvQueueInfo.RcvBufUsed += s.payloadSize()
		// Take a reference for the receive queue; released when the
		// segment is consumed by a reader.
		s.incRef()
		e.rcvQueueInfo.rcvQueue.PushBack(s)
	} else {
		e.rcvQueueInfo.RcvClosed = true
	}
	e.rcvQueueInfo.rcvQueueMu.Unlock()
	e.waiterQueue.Notify(waiter.ReadableEvents)
}

// receiveBufferAvailableLocked calculates how many bytes are still available
// in the receive buffer.
// rcvQueueMu must be held when this function is called.
func (e *endpoint) receiveBufferAvailableLocked(rcvBufSize int) int {
	// We may use more bytes than the buffer size when the receive buffer
	// shrinks.
	memUsed := e.receiveMemUsed()
	if memUsed >= rcvBufSize {
		return 0
	}

	return rcvBufSize - memUsed
}

// receiveBufferAvailable calculates how many bytes are still available in the
// receive buffer based on the actual memory used by all segments held in
// receive buffer/pending and segment queue.
2807 func (e *endpoint) receiveBufferAvailable() int { 2808 e.rcvQueueInfo.rcvQueueMu.Lock() 2809 available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize())) 2810 e.rcvQueueInfo.rcvQueueMu.Unlock() 2811 return available 2812 } 2813 2814 // receiveBufferUsed returns the amount of in-use receive buffer. 2815 func (e *endpoint) receiveBufferUsed() int { 2816 e.rcvQueueInfo.rcvQueueMu.Lock() 2817 used := e.rcvQueueInfo.RcvBufUsed 2818 e.rcvQueueInfo.rcvQueueMu.Unlock() 2819 return used 2820 } 2821 2822 // receiveMemUsed returns the total memory in use by segments held by this 2823 // endpoint. 2824 func (e *endpoint) receiveMemUsed() int { 2825 return int(atomic.LoadInt32(&e.rcvMemUsed)) 2826 } 2827 2828 // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed. 2829 func (e *endpoint) updateReceiveMemUsed(delta int) { 2830 atomic.AddInt32(&e.rcvMemUsed, int32(delta)) 2831 } 2832 2833 // maxReceiveBufferSize returns the stack wide maximum receive buffer size for 2834 // an endpoint. 2835 func (e *endpoint) maxReceiveBufferSize() int { 2836 var rs tcpip.TCPReceiveBufferSizeRangeOption 2837 if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { 2838 // As a fallback return the hardcoded max buffer size. 2839 return MaxBufferSize 2840 } 2841 return rs.Max 2842 } 2843 2844 // rcvWndScaleForHandshake computes the receive window scale to offer to the 2845 // peer when window scaling is enabled (true by default). If auto-tuning is 2846 // disabled then the window scaling factor is based on the size of the 2847 // receiveBuffer otherwise we use the max permissible receive buffer size to 2848 // compute the scale. 
2849 func (e *endpoint) rcvWndScaleForHandshake() int { 2850 bufSizeForScale := e.ops.GetReceiveBufferSize() 2851 2852 e.rcvQueueInfo.rcvQueueMu.Lock() 2853 autoTuningDisabled := e.rcvQueueInfo.RcvAutoParams.Disabled 2854 e.rcvQueueInfo.rcvQueueMu.Unlock() 2855 if autoTuningDisabled { 2856 return FindWndScale(seqnum.Size(bufSizeForScale)) 2857 } 2858 2859 return FindWndScale(seqnum.Size(e.maxReceiveBufferSize())) 2860 } 2861 2862 // updateRecentTimestamp updates the recent timestamp using the algorithm 2863 // described in https://tools.ietf.org/html/rfc7323#section-4.3 2864 func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) { 2865 if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) { 2866 e.setRecentTimestamp(tsVal) 2867 } 2868 } 2869 2870 // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if 2871 // the SYN options indicate that timestamp option was negotiated. It also 2872 // initializes the recentTS with the value provided in synOpts.TSval. 2873 func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) { 2874 if synOpts.TS { 2875 e.SendTSOk = true 2876 e.setRecentTimestamp(synOpts.TSVal) 2877 } 2878 } 2879 2880 // timestamp returns the timestamp value to be used in the TSVal field of the 2881 // timestamp option for outgoing TCP segments for a given endpoint. 2882 func (e *endpoint) timestamp() uint32 { 2883 return tcpTimeStamp(e.stack.Clock().NowMonotonic(), e.TSOffset) 2884 } 2885 2886 // tcpTimeStamp returns a timestamp offset by the provided offset. This is 2887 // not inlined above as it's used when SYN cookies are in use and endpoint 2888 // is not created at the time when the SYN cookie is sent. 
2889 func tcpTimeStamp(curTime tcpip.MonotonicTime, offset uint32) uint32 { 2890 d := curTime.Sub(tcpip.MonotonicTime{}) 2891 return uint32(d.Milliseconds()) + offset 2892 } 2893 2894 // timeStampOffset returns a randomized timestamp offset to be used when sending 2895 // timestamp values in a timestamp option for a TCP segment. 2896 func timeStampOffset(rng *rand.Rand) uint32 { 2897 // Initialize a random tsOffset that will be added to the recentTS 2898 // everytime the timestamp is sent when the Timestamp option is enabled. 2899 // 2900 // See https://tools.ietf.org/html/rfc7323#section-5.4 for details on 2901 // why this is required. 2902 // 2903 // NOTE: This is not completely to spec as normally this should be 2904 // initialized in a manner analogous to how sequence numbers are 2905 // randomized per connection basis. But for now this is sufficient. 2906 return rng.Uint32() 2907 } 2908 2909 // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint 2910 // if the SYN options indicate that the SACK option was negotiated and the TCP 2911 // stack is configured to enable TCP SACK option. 2912 func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) { 2913 var v tcpip.TCPSACKEnabled 2914 if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { 2915 // Stack doesn't support SACK. So just return. 2916 return 2917 } 2918 if bool(v) && synOpts.SACKPermitted { 2919 e.SACKPermitted = true 2920 } 2921 } 2922 2923 // maxOptionSize return the maximum size of TCP options. 2924 func (e *endpoint) maxOptionSize() (size int) { 2925 var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock 2926 options := e.makeOptions(maxSackBlocks[:]) 2927 size = len(options) 2928 putOptions(options) 2929 2930 return size 2931 } 2932 2933 // completeStateLocked makes a full copy of the endpoint and returns it. This is 2934 // used before invoking the probe. 2935 // 2936 // Precondition: e.mu must be held. 
func (e *endpoint) completeStateLocked() stack.TCPEndpointState {
	// Start from the directly-copyable state; sub-states guarded by their
	// own locks are snapshotted below.
	s := stack.TCPEndpointState{
		TCPEndpointStateInner: e.TCPEndpointStateInner,
		ID:                    stack.TCPEndpointID(e.TransportEndpointInfo.ID),
		SegTime:               e.stack.Clock().NowMonotonic(),
		Receiver:              e.rcv.TCPReceiverState,
		Sender:                e.snd.TCPSenderState,
	}

	sndBufSize := e.getSendBufferSize()
	// Copy the send buffer atomically.
	e.sndQueueInfo.sndQueueMu.Lock()
	s.SndBufState = e.sndQueueInfo.TCPSndBufState
	s.SndBufState.SndBufSize = sndBufSize
	e.sndQueueInfo.sndQueueMu.Unlock()

	// Copy the receive buffer atomically.
	e.rcvQueueInfo.rcvQueueMu.Lock()
	s.RcvBufState = e.rcvQueueInfo.TCPRcvBufState
	e.rcvQueueInfo.rcvQueueMu.Unlock()

	// Copy the endpoint TCP Option state (SACK blocks and scoreboard).
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// RTT state is guarded by the sender's rtt lock.
	e.snd.rtt.Lock()
	s.Sender.RTTState = e.snd.rtt.TCPRTTState
	e.snd.rtt.Unlock()

	// Only the CUBIC congestion controller carries extra exportable state.
	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = cubic.TCPCubicState
		s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T)
	}

	s.Sender.RACKState = e.snd.rc.TCPRACKState
	return s
}

// initHardwareGSO configures e.gso for NIC (hardware) generic segmentation
// offload based on the route's network protocol.
func (e *endpoint) initHardwareGSO() {
	switch e.route.NetProto() {
	case header.IPv4ProtocolNumber:
		e.gso.Type = stack.GSOTCPv4
		e.gso.L3HdrLen = header.IPv4MinimumSize
	case header.IPv6ProtocolNumber:
		e.gso.Type = stack.GSOTCPv6
		e.gso.L3HdrLen = header.IPv6MinimumSize
	default:
		// NOTE(review): the switch is on e.route.NetProto() but the panic
		// prints e.NetProto — presumably the same value here, but confirm;
		// the message could mislead if the two ever diverge.
		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
	}
	e.gso.NeedsCsum = true
	e.gso.CsumOffset = header.TCPChecksumOffset
	e.gso.MaxSize = e.route.GSOMaxSize()
}

// initGSO initializes the endpoint's segmentation-offload state, preferring
// hardware GSO when the route supports it and falling back to software GSO.
// If the route supports neither, e.gso is left untouched.
func (e *endpoint) initGSO() {
	if e.route.HasHardwareGSOCapability() {
		e.initHardwareGSO()
	} else if e.route.HasSoftwareGSOCapability() {
		e.gso = stack.GSO{
			MaxSize:   e.route.GSOMaxSize(),
			Type:      stack.GSOSW,
			NeedsCsum: false,
		}
	}
}

// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
	return uint32(e.EndpointState())
}

// Info returns a copy of the endpoint info.
func (e *endpoint) Info() tcpip.EndpointInfo {
	e.LockUser()
	// Make a copy of the endpoint info.
	ret := e.TransportEndpointInfo
	e.UnlockUser()
	return &ret
}

// Stats returns a pointer to the endpoint stats.
func (e *endpoint) Stats() tcpip.EndpointStats {
	return &e.stats
}

// Wait implements stack.TransportEndpoint.Wait. It blocks until
// e.workerRunning becomes false, re-checking each time the EventHUp waiter
// entry fires.
func (e *endpoint) Wait() {
	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
	e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp)
	defer e.waiterQueue.EventUnregister(&waitEntry)
	for {
		e.LockUser()
		running := e.workerRunning
		e.UnlockUser()
		if !running {
			break
		}
		// Block until the next hang-up notification, then re-check.
		<-notifyCh
	}
}

// SocketOptions implements tcpip.Endpoint.SocketOptions.
func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}

// GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption {
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
		// The stack is expected to always support this option; failing to
		// retrieve it is treated as a programmer error.
		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
	}

	return tcpip.SendBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}

// allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
func (e *endpoint) allowOutOfWindowAck() bool {
	now := e.stack.Clock().NowMonotonic()

	// The zero MonotonicTime means no out-of-window ACK has been sent yet;
	// otherwise rate-limit using the stack's invalid-packet rate limit.
	if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) {
		var limit stack.TCPInvalidRateLimitOption
		if err := e.stack.Option(&limit); err != nil {
			panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
		}
		// Too soon since the last out-of-window ACK; suppress this one.
		if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
			return false
		}
	}

	e.lastOutOfWindowAckTime = now
	return true
}

// GetTCPReceiveBufferLimits is used to get receive buffer size limits for TCP.
func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
	var ss tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
		// The stack is expected to always support this option; failing to
		// retrieve it is treated as a programmer error.
		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
	}

	return tcpip.ReceiveBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}