github.com/vpnishe/netstack@v1.10.6/tcpip/transport/tcp/endpoint.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"encoding/binary"
	"fmt"
	"math"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/vpnishe/netstack/rand"
	"github.com/vpnishe/netstack/sleep"
	"github.com/vpnishe/netstack/tcpip"
	"github.com/vpnishe/netstack/tcpip/buffer"
	"github.com/vpnishe/netstack/tcpip/hash/jenkins"
	"github.com/vpnishe/netstack/tcpip/header"
	"github.com/vpnishe/netstack/tcpip/iptables"
	"github.com/vpnishe/netstack/tcpip/seqnum"
	"github.com/vpnishe/netstack/tcpip/stack"
	"github.com/vpnishe/netstack/tmutex"
	"github.com/vpnishe/netstack/waiter"
)

// EndpointState represents the state of a TCP endpoint.
type EndpointState uint32

// Endpoint states. Note that they are represented in a netstack-specific
// manner and may not be meaningful externally. Specifically, they need to be
// translated to Linux's representation for these states if presented to
// userspace.
const (
	// Endpoint states internal to netstack. These map to the TCP state CLOSED.
	StateInitial EndpointState = iota
	StateBound
	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
	StateError

	// TCP protocol states.
	StateEstablished
	StateSynSent
	StateSynRecv
	StateFinWait1
	StateFinWait2
	StateTimeWait
	StateClose
	StateCloseWait
	StateLastAck
	StateListen
	StateClosing
)

// connected is the set of states where an endpoint is connected to a peer.
func (s EndpointState) connected() bool {
	switch s {
	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
		return true
	default:
		return false
	}
}

// String implements fmt.Stringer.String.
func (s EndpointState) String() string {
	switch s {
	case StateInitial:
		return "INITIAL"
	case StateBound:
		return "BOUND"
	case StateConnecting:
		return "CONNECTING"
	case StateError:
		return "ERROR"
	case StateEstablished:
		return "ESTABLISHED"
	case StateSynSent:
		return "SYN-SENT"
	case StateSynRecv:
		return "SYN-RCVD"
	case StateFinWait1:
		return "FIN-WAIT1"
	case StateFinWait2:
		return "FIN-WAIT2"
	case StateTimeWait:
		return "TIME-WAIT"
	case StateClose:
		return "CLOSED"
	case StateCloseWait:
		return "CLOSE-WAIT"
	case StateLastAck:
		return "LAST-ACK"
	case StateListen:
		return "LISTEN"
	case StateClosing:
		return "CLOSING"
	default:
		panic("unreachable")
	}
}

// Reasons for notifying the protocol goroutine.
const (
	notifyNonZeroReceiveWindow = 1 << iota
	notifyReceiveWindowChanged
	notifyClose
	notifyMTUChanged
	notifyDrain
	notifyReset
	notifyKeepaliveChanged
	notifyMSSChanged
	// notifyTickleWorker is used to tickle the protocol main loop during a
	// restore after we update the endpoint state to the correct one. This
	// ensures the loop terminates if the final state of the endpoint is
	// say TIME_WAIT.
	notifyTickleWorker
)

// SACKInfo holds TCP SACK related information for a given endpoint.
//
// +stateify savable
type SACKInfo struct {
	// Blocks is the maximum number of SACK blocks we track
	// per endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// blocks array above.
	NumBlocks int
}

// rcvBufAutoTuneParams are used to hold state variables to compute
// the auto tuned recv buffer size.
//
// +stateify savable
type rcvBufAutoTuneParams struct {
	// measureTime is the time at which the current measurement
	// was started.
	measureTime time.Time

	// copied is the number of bytes copied out of the receive
	// buffers since this measure began.
	copied int

	// prevCopied is the number of bytes copied out of the receive
	// buffers in the previous RTT period.
	prevCopied int

	// rtt is the non-smoothed minimum RTT as measured by observing the time
	// between when a byte is first acknowledged and the receipt of data
	// that is at least one window beyond the sequence number that was
	// acknowledged.
	rtt time.Duration

	// rttMeasureSeqNumber is the highest acceptable sequence number at the
	// time this RTT measurement period began.
	rttMeasureSeqNumber seqnum.Value

	// rttMeasureTime is the absolute time at which the current rtt
	// measurement period began.
	rttMeasureTime time.Time

	// disabled is true if an explicit receive buffer is set for the
	// endpoint.
	disabled bool
}

// ReceiveErrors collects segment receive errors within the transport layer.
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvList is full.
	ZeroRcvWindowState tcpip.StatCounter
}

// SendErrors collects segment send errors within the transport layer.
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments that failed
	// to be sent to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs that failed to be
	// sent to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}

// Stats holds statistics about the endpoint.
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}

// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface.
func (*Stats) IsEndpointStats() {}

// EndpointInfo holds useful information about a transport endpoint which
// can be queried by monitoring tools.
//
// +stateify savable
type EndpointInfo struct {
	stack.TransportEndpointInfo

	// HardError is meaningful only when state is stateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. HardError is protected by endpoint mu.
	HardError *tcpip.Error
}

// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
// marker interface.
func (*EndpointInfo) IsEndpointInfo() {}

// endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal
// to have concurrent goroutines make calls into the endpoint; they are
// properly synchronized. The protocol implementation, however, runs in a
// single goroutine.
//
// +stateify savable
type endpoint struct {
	EndpointInfo

	// workMu is used to arbitrate which goroutine may perform protocol
	// work. Only the main protocol goroutine is expected to call Lock() on
	// it, but other goroutines (e.g., send) may call TryLock() to eagerly
	// perform work without having to wait for the main one to wake up.
	workMu tmutex.Mutex

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack
	waiterQueue *waiter.Queue
	uniqueID    uint64

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex
	lastError   *tcpip.Error

	// The following fields are used to manage the receive queue. The
	// protocol goroutine adds ready-for-delivery segments to rcvList,
	// which are returned by Read() calls to users.
	//
	// Once the peer has closed its send side, rcvClosed is set to true
	// to indicate to users that no more data is coming.
	//
	// rcvListMu can be taken after the endpoint mu below.
	rcvListMu     sync.Mutex
	rcvList       segmentList
	rcvClosed     bool
	rcvBufSize    int
	rcvBufUsed    int
	rcvAutoParams rcvBufAutoTuneParams
	// zeroWindow indicates that the window was closed due to receive buffer
	// space being filled up. This is set by the worker goroutine before
	// moving a segment to the rcvList. This setting is cleared by the
	// endpoint when a Read() call reads enough data for the new window to
	// be non-zero.
	zeroWindow bool

	// The following fields are protected by the mutex.
	mu sync.RWMutex

	state EndpointState

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to its correct
	// state.
	origEndpointState EndpointState

	isPortReserved    bool
	isRegistered      bool
	boundNICID        tcpip.NICID
	route             stack.Route
	ttl               uint8
	v6only            bool
	isConnectNotified bool
	// TCP should never broadcast but Linux nevertheless supports enabling/
	// disabling SO_BROADCAST, albeit as a NOOP.
	broadcast bool

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// workerRunning specifies if a worker goroutine is running.
	workerRunning bool

	// workerCleanup specifies if the worker goroutine must perform cleanup
	// before exiting. This can only be set to true when workerRunning is
	// also true, and they're both protected by the mutex.
	workerCleanup bool

	// sendTSOk is used to indicate when the TS Option has been negotiated.
	// When sendTSOk is true every non-RST segment should carry a TS as per
	// RFC7323#section-1.1
	sendTSOk bool

	// recentTS is the timestamp that should be sent in the TSEcr field of
	// the timestamp for future segments sent by the endpoint. This field is
	// updated if required when a new segment is received by this endpoint.
	recentTS uint32

	// tsOffset is a randomized offset added to the value of the
	// TSVal field in the timestamp option.
	tsOffset uint32

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// sackPermitted is set to true if the peer sends the TCPSACKPermitted
	// option in the SYN/SYN-ACK.
	sackPermitted bool

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// reusePort is set to true if SO_REUSEPORT is enabled.
	reusePort bool

	// bindToDevice is set to the NIC on which to bind or disabled if 0.
	bindToDevice tcpip.NICID

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// cork holds back segments until full.
	//
	// cork is a boolean (0 is false) and must be accessed atomically.
	cork uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// The options below aren't implemented, but we remember the user
	// settings because applications expect to be able to set/query these
	// options.
	reuseAddr bool

	// slowAck holds the negated state of quick ack. It is stubbed out and
	// does nothing.
	//
	// slowAck is a boolean (0 is false) and must be accessed atomically.
	slowAck uint32

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue

	// synRcvdCount is the number of connections for this endpoint that are
	// in SYN-RCVD state.
	synRcvdCount int

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// The following fields are used to manage the send buffer. When
	// segments are ready to be sent, they are added to sndQueue and the
	// protocol goroutine is signaled via sndWaker.
	//
	// When the send side is closed, the protocol goroutine is notified via
	// sndCloseWaker, and sndClosed is set to true.
	sndBufMu      sync.Mutex
	sndBufSize    int
	sndBufUsed    int
	sndClosed     bool
	sndBufInQueue seqnum.Size
	sndQueue      segmentList
	sndWaker      sleep.Waker
	sndCloseWaker sleep.Waker

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// The following are used when a "packet too big" control packet is
	// received. They are protected by sndBufMu. They are used to
	// communicate to the main protocol goroutine how many such control
	// messages have been received since the last notification was processed
	// and what was the smallest MTU seen.
	packetTooBigCount int
	sndMTU            int

	// newSegmentWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and handle new segments queued to it.
	newSegmentWaker sleep.Waker

	// notificationWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and check for notifications.
	notificationWaker sleep.Waker

	// notifyFlags is a bitmask of flags used to indicate to the protocol
	// goroutine what it was notified of; this is only accessed atomically.
	notifyFlags uint32

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepaliveIdle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// pendingAccepted is a synchronization primitive used to track number
	// of connections that are queued up to be delivered to the accepted
	// channel. We use this to ensure that all goroutines blocked on writing
	// to the acceptedChan below terminate before we close acceptedChan.
	pendingAccepted sync.WaitGroup

	// acceptedChan is used by a listening endpoint protocol goroutine to
	// send newly accepted connections to the endpoint so that they can be
	// read by Accept() calls.
	acceptedChan chan *endpoint

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver
	snd *sender

	// The goroutine drain completion notification channel.
	drainDone chan struct{}

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{}

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso *stack.GSO

	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
	stats Stats

	// tcpLingerTimeout is the maximum amount of time a socket stays in
	// TIME_WAIT state before being marked closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has called Close() on the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool
}

// UniqueID implements stack.TransportEndpoint.UniqueID.
func (e *endpoint) UniqueID() uint64 {
	return e.uniqueID
}

// calculateAdvertisedMSS calculates the MSS to advertise.
//
// If userMSS is non-zero and is not greater than the maximum possible MSS for
// r, it will be used; otherwise, the maximum possible MSS will be used.
func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 {
	// The maximum possible MSS is dependent on the route.
	maxMSS := mssForRoute(&r)

	if userMSS != 0 && userMSS < maxMSS {
		return userMSS
	}

	return maxMSS
}
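
// exampleAdvertisedMSSClamp is an illustrative sketch and not part of the
// original netstack source: it reproduces the clamping performed by
// calculateAdvertisedMSS with plain integers so the behaviour is easy to see.
// The MSS values below are hypothetical.
func exampleAdvertisedMSSClamp() {
	maxMSS := uint16(1460) // e.g. a 1500-byte MTU route minus 40 bytes of IPv4+TCP headers
	for _, userMSS := range []uint16{0, 536, 9000} {
		adv := maxMSS
		if userMSS != 0 && userMSS < maxMSS {
			adv = userMSS
		}
		// userMSS 0 and 9000 fall back to 1460; 536 is honored as-is.
		fmt.Println(userMSS, adv)
	}
}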
// StopWork halts packet processing. Only to be used in tests.
func (e *endpoint) StopWork() {
	e.workMu.Lock()
}

// ResumeWork resumes packet processing. Only to be used in tests.
func (e *endpoint) ResumeWork() {
	e.workMu.Unlock()
}

// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// +stateify savable
type keepalive struct {
	sync.Mutex
	enabled  bool
	idle     time.Duration
	interval time.Duration
	count    int
	unacked  int
	timer    timer
	waker    sleep.Waker
}

func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
	e := &endpoint{
		stack: s,
		EndpointInfo: EndpointInfo{
			TransportEndpointInfo: stack.TransportEndpointInfo{
				NetProto:   netProto,
				TransProto: header.TCPProtocolNumber,
			},
		},
		waiterQueue: waiterQueue,
		state:       StateInitial,
		rcvBufSize:  DefaultReceiveBufferSize,
		sndBufSize:  DefaultSendBufferSize,
		sndMTU:      int(math.MaxInt32),
		reuseAddr:   true,
		keepalive: keepalive{
			// Linux defaults.
			idle:     2 * time.Hour,
			interval: 75 * time.Second,
			count:    9,
		},
		uniqueID: s.UniqueID(),
	}

	var ss SendBufferSizeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
		e.sndBufSize = ss.Default
	}

	var rs ReceiveBufferSizeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
		e.rcvBufSize = rs.Default
	}

	var cs tcpip.CongestionControlOption
	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
		e.cc = cs
	}

	var mrb tcpip.ModerateReceiveBufferOption
	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
		e.rcvAutoParams.disabled = !bool(mrb)
	}

	var de DelayEnabled
	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
		e.SetSockOptInt(tcpip.DelayOption, 1)
	}

	var tcpLT tcpip.TCPLingerTimeoutOption
	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
		e.tcpLingerTimeout = time.Duration(tcpLT)
	}

	if p := s.GetTCPProbe(); p != nil {
		e.probe = p
	}

	e.segmentQueue.setLimit(MaxUnprocessedSegments)
	e.workMu.Init()
	e.workMu.Lock()
	e.tsOffset = timeStampOffset()

	return e
}

// Readiness returns the current readiness of the endpoint. For example, if
// waiter.EventIn is set, the endpoint is immediately readable.
func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
	result := waiter.EventMask(0)

	e.mu.RLock()
	defer e.mu.RUnlock()

	switch e.state {
	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
		// Ready for nothing.

	case StateClose, StateError:
		// Ready for anything.
		result = mask

	case StateListen:
		// Check if there's anything in the accepted channel.
		if (mask & waiter.EventIn) != 0 {
			if len(e.acceptedChan) > 0 {
				result |= waiter.EventIn
			}
		}
	}
	if e.state.connected() {
		// Determine if the endpoint is writable if requested.
		if (mask & waiter.EventOut) != 0 {
			e.sndBufMu.Lock()
			if e.sndClosed || e.sndBufUsed < e.sndBufSize {
				result |= waiter.EventOut
			}
			e.sndBufMu.Unlock()
		}

		// Determine if the endpoint is readable if requested.
		if (mask & waiter.EventIn) != 0 {
			e.rcvListMu.Lock()
			if e.rcvBufUsed > 0 || e.rcvClosed {
				result |= waiter.EventIn
			}
			e.rcvListMu.Unlock()
		}
	}

	return result
}

func (e *endpoint) fetchNotifications() uint32 {
	return atomic.SwapUint32(&e.notifyFlags, 0)
}

func (e *endpoint) notifyProtocolGoroutine(n uint32) {
	for {
		v := atomic.LoadUint32(&e.notifyFlags)
		if v&n == n {
			// The flags are already set.
			return
		}

		if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) {
			if v == 0 {
				// We are causing a transition from no flags to
				// at least one flag set, so we must cause the
				// protocol goroutine to wake up.
				e.notificationWaker.Assert()
			}
			return
		}
	}
}
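
// exampleNotifyFlagUsage is an illustrative sketch and not part of the
// original source: it shows how the notify* bits defined earlier combine into
// the single word that notifyProtocolGoroutine sets and fetchNotifications
// drains. The particular flags chosen here are arbitrary.
func exampleNotifyFlagUsage() {
	var flags uint32
	flags |= notifyClose | notifyReset // two pending notifications folded into one word
	if flags&notifyReset != 0 {
		fmt.Println("reset requested") // the protocol goroutine reacts to each set bit
	}
	// fetchNotifications performs the equivalent of reading and zeroing the
	// word, but with a single atomic swap.
	pending := flags
	flags = 0
	_, _ = pending, flags
}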
// Close puts the endpoint in a closed state and frees all resources associated
// with it. It must be called only once and with no other concurrent calls to
// the endpoint.
func (e *endpoint) Close() {
	e.mu.Lock()
	closed := e.closed
	e.mu.Unlock()
	if closed {
		return
	}

	// Issue a shutdown so that the peer knows we won't send any more data
	// if we're connected, or stop accepting if we're listening.
	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)

	e.mu.Lock()

	// For listening sockets, we always release ports inline so that they
	// are immediately available for reuse after Close() is called. If also
	// registered, we unregister as well otherwise the next user would fail
	// in Listen() when trying to register.
	if e.state == StateListen && e.isPortReserved {
		if e.isRegistered {
			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
			e.isRegistered = false
		}

		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
		e.isPortReserved = false
	}

	// Mark endpoint as closed.
	e.closed = true
	// Either perform the local cleanup or kick the worker to make sure it
	// knows it needs to cleanup.
	tcpip.AddDanglingEndpoint(e)
	if !e.workerRunning {
		e.cleanupLocked()
	} else {
		e.workerCleanup = true
		e.notifyProtocolGoroutine(notifyClose)
	}

	e.mu.Unlock()
}

// closePendingAcceptableConnectionsLocked closes all connections that have
// completed the handshake but have not yet been delivered to the application.
func (e *endpoint) closePendingAcceptableConnectionsLocked() {
	done := make(chan struct{})
	// Spin a goroutine up as ranging on e.acceptedChan will just block when
	// there are no more connections in the channel. Using a non-blocking
	// select does not work as it can potentially select the default case
	// even when there are pending writes but that are not yet written to
	// the channel.
	go func() {
		defer close(done)
		for n := range e.acceptedChan {
			n.notifyProtocolGoroutine(notifyReset)
			n.Close()
		}
	}()
	// pendingAccepted (see endpoint.deliverAccepted) tracks the number of
	// endpoints which have completed handshake but are not yet written to
	// the e.acceptedChan. We wait here till the goroutine above can drain
	// all such connections from e.acceptedChan.
	e.pendingAccepted.Wait()
	close(e.acceptedChan)
	<-done
	e.acceptedChan = nil
}

// cleanupLocked frees all resources associated with the endpoint. It is called
// after Close() is called and the worker goroutine (if any) is done with its
// work.
func (e *endpoint) cleanupLocked() {
	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	if e.acceptedChan != nil {
		e.closePendingAcceptableConnectionsLocked()
	}
	e.workerCleanup = false

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
		e.isPortReserved = false
	}

	e.route.Release()
	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}

// initialReceiveWindow returns the initial receive window to advertise in the
// SYN/SYN-ACK.
func (e *endpoint) initialReceiveWindow() int {
	rcvWnd := e.receiveBufferAvailable()
	if rcvWnd > math.MaxUint16 {
		rcvWnd = math.MaxUint16
	}

	// Use the user supplied MSS, if available.
	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
	if rcvWnd > routeWnd {
		rcvWnd = routeWnd
	}
	return rcvWnd
}

// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to user space.
func (e *endpoint) ModerateRecvBuf(copied int) {
	e.rcvListMu.Lock()
	if e.rcvAutoParams.disabled {
		e.rcvListMu.Unlock()
		return
	}
	now := time.Now()
	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
		e.rcvAutoParams.copied += copied
		e.rcvListMu.Unlock()
		return
	}
	prevRTTCopied := e.rcvAutoParams.copied + copied
	prevCopied := e.rcvAutoParams.prevCopied
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediately preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows its congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive up to 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		if rcvWnd > e.rcvBufSize {
			e.rcvBufSize = rcvWnd
			e.notifyProtocolGoroutine(notifyReceiveWindowChanged)
		}

		// We only update prevCopied when we grow the buffer because in cases
		// where prevCopied > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.rcvAutoParams.prevCopied = prevRTTCopied
	}
	e.rcvAutoParams.measureTime = now
	e.rcvAutoParams.copied = 0
	e.rcvListMu.Unlock()
}

// IPTables implements tcpip.Endpoint.IPTables.
func (e *endpoint) IPTables() (iptables.IPTables, error) {
	return e.stack.IPTables(), nil
}
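
// exampleRcvBufAutoTune is an illustrative sketch and not part of the
// original source: it walks through the arithmetic in ModerateRecvBuf above
// with hypothetical numbers. With an advertised MSS of 1460 bytes, 256 KiB
// copied out in the RTT that just ended and 128 KiB in the previous one, the
// candidate window is copied*2 + 16*MSS plus a doubled slow-start allowance.
func exampleRcvBufAutoTune() {
	const (
		amss       = 1460      // hypothetical advertised MSS
		prevCopied = 128 << 10 // bytes copied in the previous RTT (hypothetical)
		copied     = 256 << 10 // bytes copied in the RTT that just ended (hypothetical)
	)
	rcvWnd := copied*2 + 16*amss
	grow := (rcvWnd * (copied - prevCopied)) / prevCopied
	rcvWnd += grow * 2
	// The endpoint would only grow rcvBufSize toward this value, never shrink it.
	fmt.Println(rcvWnd)
}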
// Read reads data from the endpoint.
func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
	e.mu.RLock()
	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning an ECONNRESET.
	e.rcvListMu.Lock()
	bufUsed := e.rcvBufUsed
	if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
		e.rcvListMu.Unlock()
		he := e.HardError
		e.mu.RUnlock()
		if s == StateError {
			return buffer.View{}, tcpip.ControlMessages{}, he
		}
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
	}

	v, err := e.readLocked()
	e.rcvListMu.Unlock()

	e.mu.RUnlock()

	if err == tcpip.ErrClosedForReceive {
		e.stats.ReadErrors.ReadClosed.Increment()
	}
	return v, tcpip.ControlMessages{}, err
}

func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
	if e.rcvBufUsed == 0 {
		if e.rcvClosed || !e.state.connected() {
			return buffer.View{}, tcpip.ErrClosedForReceive
		}
		return buffer.View{}, tcpip.ErrWouldBlock
	}

	s := e.rcvList.Front()
	views := s.data.Views()
	v := views[s.viewToDeliver]
	s.viewToDeliver++

	if s.viewToDeliver >= len(views) {
		e.rcvList.Remove(s)
		s.decRef()
	}

	e.rcvBufUsed -= len(v)
	// If the window was zero before this read and if the read freed up
	// enough buffer space for the scaled window to be non-zero then notify
	// the protocol goroutine to send a window update.
	if e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale) {
		e.zeroWindow = false
		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
	}

	return v, nil
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// Caller must hold e.mu and e.sndBufMu.
func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	if !e.state.connected() {
		switch e.state {
		case StateError:
			return 0, e.HardError
		default:
			return 0, tcpip.ErrClosedForSend
		}
	}

	// Check if the connection has already been closed for sends.
	if e.sndClosed {
		return 0, tcpip.ErrClosedForSend
	}

	avail := e.sndBufSize - e.sndBufUsed
	if avail <= 0 {
		return 0, tcpip.ErrWouldBlock
	}
	return avail, nil
}

// Write writes data to the endpoint's peer.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.mu.RLock()
	e.sndBufMu.Lock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.sndBufMu.Unlock()
		e.mu.RUnlock()
		e.stats.WriteErrors.WriteClosed.Increment()
		return 0, nil, err
	}

	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		e.sndBufMu.Unlock()
		e.mu.RUnlock()
	}

	// Fetch data.
	v, perr := p.Payload(avail)
	if perr != nil || len(v) == 0 {
		if opts.Atomic { // See above.
			e.sndBufMu.Unlock()
			e.mu.RUnlock()
		}
		// Note that perr may be nil if len(v) == 0.
		return 0, nil, perr
	}

	if !opts.Atomic { // See above.
		e.mu.RLock()
		e.sndBufMu.Lock()

		// Because we released the lock before copying, check state again
		// to make sure the endpoint is still in a valid state for a write.
		avail, err = e.isEndpointWritableLocked()
		if err != nil {
			e.sndBufMu.Unlock()
			e.mu.RUnlock()
			e.stats.WriteErrors.WriteClosed.Increment()
			return 0, nil, err
		}

		// Discard any excess data copied in due to avail being reduced due
		// to a simultaneous write call to the socket.
		if avail < len(v) {
			v = v[:avail]
		}
	}

	// Add data to the send queue.
	s := newSegmentFromView(&e.route, e.ID, v)
	e.sndBufUsed += len(v)
	e.sndBufInQueue += seqnum.Size(len(v))
	e.sndQueue.PushBack(s)
	e.sndBufMu.Unlock()
	// Release the endpoint lock to prevent deadlocks due to lock
	// order inversion when acquiring workMu.
	e.mu.RUnlock()

	if e.workMu.TryLock() {
		// Do the work inline.
		e.handleWrite()
		e.workMu.Unlock()
	} else {
		// Let the protocol goroutine do the work.
		e.sndWaker.Assert()
	}

	return int64(len(v)), nil, nil
}

// Peek reads data without consuming it from the endpoint.
//
// This method does not block if there is no data pending.
func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data.
	if s := e.state; !s.connected() && s != StateClose {
		if s == StateError {
			return 0, tcpip.ControlMessages{}, e.HardError
		}
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
	}

	e.rcvListMu.Lock()
	defer e.rcvListMu.Unlock()

	if e.rcvBufUsed == 0 {
		if e.rcvClosed || !e.state.connected() {
			e.stats.ReadErrors.ReadClosed.Increment()
			return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
		}
		return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
	}

	// Make a copy of vec so we can modify the slice headers.
	vec = append([][]byte(nil), vec...)

	var num int64
	for s := e.rcvList.Front(); s != nil; s = s.Next() {
		views := s.data.Views()

		for i := s.viewToDeliver; i < len(views); i++ {
			v := views[i]

			for len(v) > 0 {
				if len(vec) == 0 {
					return num, tcpip.ControlMessages{}, nil
				}
				if len(vec[0]) == 0 {
					vec = vec[1:]
					continue
				}

				n := copy(vec[0], v)
				v = v[n:]
				vec[0] = vec[0][n:]
				num += int64(n)
			}
		}
	}

	return num, tcpip.ControlMessages{}, nil
}

// zeroReceiveWindow checks if the receive window to be announced now would be
// zero, based on the amount of available buffer and the receive window scaling.
//
// It must be called with rcvListMu held.
func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
	if e.rcvBufUsed >= e.rcvBufSize {
		return true
	}

	return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0
}
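
// exampleZeroWindowCheck is an illustrative sketch and not part of the
// original source: it reproduces the shift in zeroReceiveWindow above. With a
// window scale of 7, any remaining buffer space smaller than 128 bytes is
// advertised as a zero window. The buffer sizes used are hypothetical.
func exampleZeroWindowCheck() {
	const scale = 7
	rcvBufSize := 64 << 10
	for _, used := range []int{rcvBufSize - 200, rcvBufSize - 100, rcvBufSize} {
		zero := used >= rcvBufSize || ((rcvBufSize-used)>>scale) == 0
		// 200 bytes free -> false, 100 bytes free -> true, full buffer -> true.
		fmt.Println(used, zero)
	}
}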
// SetSockOptInt sets a socket option.
func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
	switch opt {
	case tcpip.ReceiveBufferSizeOption:
		// Make sure the receive buffer size is within the min and max
		// allowed.
		var rs ReceiveBufferSizeOption
		size := int(v)
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
			if size < rs.Min {
				size = rs.Min
			}
			if size > rs.Max {
				size = rs.Max
			}
		}

		mask := uint32(notifyReceiveWindowChanged)

		e.rcvListMu.Lock()

		// Make sure the receive buffer size allows us to send a
		// non-zero window size.
		scale := uint8(0)
		if e.rcv != nil {
			scale = e.rcv.rcvWndScale
		}
		if size>>scale == 0 {
			size = 1 << scale
		}

		// Make sure 2*size doesn't overflow.
		if size > math.MaxInt32/2 {
			size = math.MaxInt32 / 2
		}

		e.rcvBufSize = size
		e.rcvAutoParams.disabled = true
		if e.zeroWindow && !e.zeroReceiveWindow(scale) {
			e.zeroWindow = false
			mask |= notifyNonZeroReceiveWindow
		}
		e.rcvListMu.Unlock()

		e.notifyProtocolGoroutine(mask)
		return nil

	case tcpip.SendBufferSizeOption:
		// Make sure the send buffer size is within the min and max
		// allowed.
		size := int(v)
		var ss SendBufferSizeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
			if size < ss.Min {
				size = ss.Min
			}
			if size > ss.Max {
				size = ss.Max
			}
		}

		e.sndBufMu.Lock()
		e.sndBufSize = size
		e.sndBufMu.Unlock()
		return nil

	case tcpip.DelayOption:
		if v == 0 {
			atomic.StoreUint32(&e.delay, 0)

			// Handle delayed data.
			e.sndWaker.Assert()
		} else {
			atomic.StoreUint32(&e.delay, 1)
		}
		return nil

	default:
		return nil
	}
}

// SetSockOpt sets a socket option.
func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
	// Lower 2 bits represent ECN bits. RFC 3168, section 23.1
	const inetECNMask = 3
	switch v := opt.(type) {
	case tcpip.CorkOption:
		if v == 0 {
			atomic.StoreUint32(&e.cork, 0)

			// Handle the corked data.
			e.sndWaker.Assert()
		} else {
			atomic.StoreUint32(&e.cork, 1)
		}
		return nil

	case tcpip.ReuseAddressOption:
		e.mu.Lock()
		e.reuseAddr = v != 0
		e.mu.Unlock()
		return nil

	case tcpip.ReusePortOption:
		e.mu.Lock()
		e.reusePort = v != 0
		e.mu.Unlock()
		return nil

	case tcpip.BindToDeviceOption:
		e.mu.Lock()
		defer e.mu.Unlock()
		if v == "" {
			e.bindToDevice = 0
			return nil
		}
		for nicID, nic := range e.stack.NICInfo() {
			if nic.Name == string(v) {
				e.bindToDevice = nicID
				return nil
			}
		}
		return tcpip.ErrUnknownDevice

	case tcpip.QuickAckOption:
		if v == 0 {
			atomic.StoreUint32(&e.slowAck, 1)
		} else {
			atomic.StoreUint32(&e.slowAck, 0)
		}
		return nil

	case tcpip.MaxSegOption:
		userMSS := v
		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
			return tcpip.ErrInvalidOptionValue
		}
		e.mu.Lock()
		e.userMSS = uint16(userMSS)
		e.mu.Unlock()
		e.notifyProtocolGoroutine(notifyMSSChanged)
		return nil

	case tcpip.V6OnlyOption:
		// We only recognize this option on v6 endpoints.
		if e.NetProto != header.IPv6ProtocolNumber {
			return tcpip.ErrInvalidEndpointState
		}

		e.mu.Lock()
		defer e.mu.Unlock()

		// We only allow this to be set when we're in the initial state.
		if e.state != StateInitial {
			return tcpip.ErrInvalidEndpointState
		}

		e.v6only = v != 0
		return nil

	case tcpip.TTLOption:
		e.mu.Lock()
		e.ttl = uint8(v)
		e.mu.Unlock()
		return nil

	case tcpip.KeepaliveEnabledOption:
		e.keepalive.Lock()
		e.keepalive.enabled = v != 0
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
		return nil

	case tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
		return nil

	case tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
		return nil

	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		e.keepalive.count = int(v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
		return nil

	case tcpip.BroadcastOption:
		e.mu.Lock()
		e.broadcast = v != 0
		e.mu.Unlock()
		return nil

	case tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.AvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if v == tcpip.CongestionControlOption(cc) {
				// Acquire the work mutex as we may need to
				// reinitialize the congestion control state.
				e.mu.Lock()
				state := e.state
				e.cc = v
				e.mu.Unlock()
				switch state {
				case StateEstablished:
					e.workMu.Lock()
					e.mu.Lock()
					if e.state == state {
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
					e.mu.Unlock()
					e.workMu.Unlock()
				}
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return tcpip.ErrNoSuchFile

	case tcpip.IPv4TOSOption:
		e.mu.Lock()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.mu.Unlock()
		return nil

	case tcpip.IPv6TrafficClassOption:
		e.mu.Lock()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.mu.Unlock()
		return nil

	case tcpip.TCPLingerTimeoutOption:
		e.mu.Lock()
		if v < 0 {
			// Same as effectively disabling TCPLinger timeout.
			v = 0
		}
		var stkTCPLingerTimeout tcpip.TCPLingerTimeoutOption
		if err := e.stack.TransportProtocolOption(header.TCPProtocolNumber, &stkTCPLingerTimeout); err != nil {
			// We were unable to retrieve a stack config, just use
			// the DefaultTCPLingerTimeout.
			if v > tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout) {
				stkTCPLingerTimeout = tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout)
			}
		}
		// Cap it to the stack wide TCPLinger timeout.
		if v > stkTCPLingerTimeout {
			v = stkTCPLingerTimeout
		}
		e.tcpLingerTimeout = time.Duration(v)
		e.mu.Unlock()
		return nil

	default:
		return nil
	}
}

// readyReceiveSize returns the number of bytes ready to be received.
func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	// The endpoint cannot be in listen state.
	if e.state == StateListen {
		return 0, tcpip.ErrInvalidEndpointState
	}

	e.rcvListMu.Lock()
	defer e.rcvListMu.Unlock()

	return e.rcvBufUsed, nil
}

// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
	switch opt {
	case tcpip.ReceiveQueueSizeOption:
		return e.readyReceiveSize()

	case tcpip.SendBufferSizeOption:
		e.sndBufMu.Lock()
		v := e.sndBufSize
		e.sndBufMu.Unlock()
		return v, nil

	case tcpip.ReceiveBufferSizeOption:
		e.rcvListMu.Lock()
		v := e.rcvBufSize
		e.rcvListMu.Unlock()
		return v, nil

	case tcpip.DelayOption:
		var o int
		if v := atomic.LoadUint32(&e.delay); v != 0 {
			o = 1
		}
		return o, nil

	default:
		return -1, tcpip.ErrUnknownProtocolOption
	}
}

// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
	switch o := opt.(type) {
	case tcpip.ErrorOption:
		e.lastErrorMu.Lock()
		err := e.lastError
		e.lastError = nil
		e.lastErrorMu.Unlock()
		return err

	case *tcpip.MaxSegOption:
		// This is just stubbed out. Linux never returns the user_mss
		// value as it either returns the defaultMSS or returns the
		// actual current MSS. Netstack just returns the defaultMSS
		// always for now.
		*o = header.TCPDefaultMSS
		return nil

	case *tcpip.CorkOption:
		*o = 0
		if v := atomic.LoadUint32(&e.cork); v != 0 {
			*o = 1
		}
		return nil

	case *tcpip.ReuseAddressOption:
		e.mu.RLock()
		v := e.reuseAddr
		e.mu.RUnlock()

		*o = 0
		if v {
			*o = 1
		}
		return nil

	case *tcpip.ReusePortOption:
		e.mu.RLock()
		v := e.reusePort
		e.mu.RUnlock()

		*o = 0
		if v {
			*o = 1
		}
		return nil

	case *tcpip.BindToDeviceOption:
		e.mu.RLock()
		defer e.mu.RUnlock()
		if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok {
			*o = tcpip.BindToDeviceOption(nic.Name)
			return nil
		}
		*o = ""
		return nil

	case *tcpip.QuickAckOption:
		*o = 1
		if v := atomic.LoadUint32(&e.slowAck); v != 0 {
			*o = 0
		}
		return nil

	case *tcpip.V6OnlyOption:
		// We only recognize this option on v6 endpoints.
		if e.NetProto != header.IPv6ProtocolNumber {
			return tcpip.ErrUnknownProtocolOption
		}

		e.mu.Lock()
		v := e.v6only
		e.mu.Unlock()

		*o = 0
		if v {
			*o = 1
		}
		return nil

	case *tcpip.TTLOption:
		e.mu.Lock()
		*o = tcpip.TTLOption(e.ttl)
		e.mu.Unlock()
		return nil

	case *tcpip.TCPInfoOption:
		*o = tcpip.TCPInfoOption{}
		e.mu.RLock()
		snd := e.snd
		e.mu.RUnlock()
		if snd != nil {
			snd.rtt.Lock()
			o.RTT = snd.rtt.srtt
			o.RTTVar = snd.rtt.rttvar
			snd.rtt.Unlock()
		}
		return nil

	case *tcpip.KeepaliveEnabledOption:
		e.keepalive.Lock()
		v := e.keepalive.enabled
		e.keepalive.Unlock()

		*o = 0
		if v {
			*o = 1
		}
		return nil

	case *tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
		e.keepalive.Unlock()
		return nil

	case *tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
		e.keepalive.Unlock()
		return nil

	case *tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveCountOption(e.keepalive.count)
		e.keepalive.Unlock()
		return nil

	case *tcpip.OutOfBandInlineOption:
		// We don't currently support disabling this option.
		*o = 1
		return nil

	case *tcpip.BroadcastOption:
		e.mu.Lock()
		v := e.broadcast
		e.mu.Unlock()

		*o = 0
		if v {
			*o = 1
		}
		return nil

	case *tcpip.CongestionControlOption:
		e.mu.Lock()
		*o = e.cc
		e.mu.Unlock()
		return nil

	case *tcpip.IPv4TOSOption:
		e.mu.RLock()
		*o = tcpip.IPv4TOSOption(e.sendTOS)
		e.mu.RUnlock()
		return nil

	case *tcpip.IPv6TrafficClassOption:
		e.mu.RLock()
		*o = tcpip.IPv6TrafficClassOption(e.sendTOS)
		e.mu.RUnlock()
		return nil

	case *tcpip.TCPLingerTimeoutOption:
		e.mu.Lock()
		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
		e.mu.Unlock()
		return nil

	default:
		return tcpip.ErrUnknownProtocolOption
	}
}

func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
	netProto := e.NetProto
	if header.IsV4MappedAddress(addr.Addr) {
		// Fail if using a v4 mapped address on a v6only endpoint.
		if e.v6only {
			return 0, tcpip.ErrNoRoute
		}

		netProto = header.IPv4ProtocolNumber
		addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
		if addr.Addr == header.IPv4Any {
			addr.Addr = ""
		}
	}

	// Fail if we're bound to an address length different from the one we're
	// checking.
	if l := len(e.ID.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) {
		return 0, tcpip.ErrInvalidEndpointState
	}

	return netProto, nil
}
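
// exampleV4MappedAddr is an illustrative sketch and not part of the original
// source: it mirrors the address rewriting done by checkV4Mapped above. On a
// dual-stack (v6only=false) endpoint, a v4-mapped IPv6 address is truncated to
// its 4-byte IPv4 form and the effective network protocol becomes IPv4. The
// literal address below is hypothetical.
func exampleV4MappedAddr() {
	mapped := tcpip.Address("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x0a\x00\x00\x01") // ::ffff:10.0.0.1
	if header.IsV4MappedAddress(mapped) {
		v4 := mapped[header.IPv6AddressSize-header.IPv4AddressSize:]
		fmt.Printf("%d.%d.%d.%d\n", v4[0], v4[1], v4[2], v4[3]) // prints 10.0.0.1
	}
}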
// Disconnect implements tcpip.Endpoint.Disconnect.
func (*endpoint) Disconnect() *tcpip.Error {
	return tcpip.ErrNotSupported
}

// Connect connects the endpoint to its peer.
func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
	err := e.connect(addr, true, true)
	if err != nil && !err.IgnoreStats() {
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.stats.FailedConnectionAttempts.Increment()
	}
	return err
}

// connect connects the endpoint to its peer. In the normal non-S/R case, the
// new connection is expected to run the main goroutine and perform handshake.
// In restore of previously connected endpoints, both ends will be passively
// created (so no new handshaking is done); for stack-accepted connections not
// yet accepted by the app, they are restored without running the main goroutine
// here.
func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()

	connectingAddr := addr.Addr

	netProto, err := e.checkV4Mapped(&addr)
	if err != nil {
		return err
	}

	if e.state.connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return tcpip.ErrAlreadyConnected
	}

	nicID := addr.NIC
	switch e.state {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return tcpip.ErrNoRoute
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return tcpip.ErrAlreadyConnecting

	case StateError:
		return e.HardError

	default:
		return tcpip.ErrInvalidEndpointState
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer r.Release()

	origID := e.ID

	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	e.ID.LocalAddress = r.LocalAddress
	e.ID.RemoteAddress = r.RemoteAddress
	e.ID.RemotePort = addr.Port

	if e.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.ID.LocalAddress == e.ID.RemoteAddress

		// Calculate a port offset based on the destination IP/port and
		// src IP to ensure that for a given tuple (srcIP, destIP,
		// destPort) the offset used as a starting point is the same to
		// ensure that we can cycle through the port space effectively.
		h := jenkins.Sum32(e.stack.Seed())
		h.Write([]byte(e.ID.LocalAddress))
		h.Write([]byte(e.ID.RemoteAddress))
		portBuf := make([]byte, 2)
		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)
		h.Write(portBuf)
		portOffset := h.Sum32()

		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, *tcpip.Error) {
			if sameAddr && p == e.ID.RemotePort {
				return false, nil
			}
			// reusePort is false below because connect cannot reuse a port even if
			// reusePort was set.
			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, false /* reusePort */, e.bindToDevice) {
				return false, nil
			}

			id := e.ID
			id.LocalPort = p
			switch e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) {
			case nil:
				e.ID = id
				return true, nil
			case tcpip.ErrPortInUse:
				return false, nil
			default:
				return false, err
			}
		}); err != nil {
			return err
		}
	}

	// Remove the port reservation. This can happen when Bind is called
	// before Connect: in such a case we don't want to hold on to
	// reservations anymore.
	if e.isPortReserved {
		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.bindToDevice)
		e.isPortReserved = false
	}

	e.isRegistered = true
	e.state = StateConnecting
	e.route = r.Clone()
	e.boundNICID = nicID
	e.effectiveNetProtos = netProtos
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		e.segmentQueue.mu.Lock()
		for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.ID
				s.route = r.Clone()
				e.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.state = StateEstablished
		e.stack.Stats().TCP.CurrentEstablished.Increment()
	}

	if run {
		e.workerRunning = true
		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
		go e.protocolMainLoop(handshake)
	}

	return tcpip.ErrConnectStarted
}

// ConnectEndpoint is not supported.
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
	return tcpip.ErrInvalidEndpointState
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
	e.mu.Lock()
	e.shutdownFlags |= flags
	finQueued := false
	switch {
	case e.state.connected():
		// Close for read.
		if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
			// Mark read side as closed.
			e.rcvListMu.Lock()
			e.rcvClosed = true
			rcvBufUsed := e.rcvBufUsed
			e.rcvListMu.Unlock()

			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
				e.notifyProtocolGoroutine(notifyReset)
				e.mu.Unlock()
				return nil
			}
		}

		// Close for write.
		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
			e.sndBufMu.Lock()

			if e.sndClosed {
				// Already closed.
				e.sndBufMu.Unlock()
				break
			}

			// Queue fin segment.
			s := newSegmentFromView(&e.route, e.ID, nil)
			e.sndQueue.PushBack(s)
			e.sndBufInQueue++
			finQueued = true
			// Mark endpoint as closed.
			e.sndClosed = true

			e.sndBufMu.Unlock()
		}

	case e.state == StateListen:
		// Tell protocolListenLoop to stop.
		if flags&tcpip.ShutdownRead != 0 {
			e.notifyProtocolGoroutine(notifyClose)
		}
	default:
		e.mu.Unlock()
		return tcpip.ErrNotConnected
	}
	e.mu.Unlock()
	if finQueued {
		if e.workMu.TryLock() {
			e.handleClose()
			e.workMu.Unlock()
		} else {
			// Tell protocol goroutine to close.
			e.sndCloseWaker.Assert()
		}
	}
	return nil
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *endpoint) Listen(backlog int) *tcpip.Error {
	err := e.listen(backlog)
	if err != nil && !err.IgnoreStats() {
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.stats.FailedConnectionAttempts.Increment()
	}
	return err
}

func (e *endpoint) listen(backlog int) *tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()

	// Allow the backlog to be adjusted if the endpoint is not shutting down.
	// When the endpoint shuts down, it sets workerCleanup to true, and from
	// that point onward, acceptedChan is the responsibility of the cleanup()
	// method (and should not be touched anywhere else, including here).
	if e.state == StateListen && !e.workerCleanup {
		// Adjust the size of the channel only if we can fit the
		// existing pending connections into the new one.
		if len(e.acceptedChan) > backlog {
			return tcpip.ErrInvalidEndpointState
		}
		if cap(e.acceptedChan) == backlog {
			return nil
		}
		origChan := e.acceptedChan
		e.acceptedChan = make(chan *endpoint, backlog)
		close(origChan)
		for ep := range origChan {
			e.acceptedChan <- ep
		}
		return nil
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.state != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return tcpip.ErrInvalidEndpointState
	}

	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice); err != nil {
		return err
	}

	e.isRegistered = true
	e.state = StateListen
	if e.acceptedChan == nil {
		e.acceptedChan = make(chan *endpoint, backlog)
	}
	e.workerRunning = true

	go e.protocolListenLoop(
		seqnum.Size(e.receiveBufferAvailable()))

	return nil
}

// startAcceptedLoop sets up required state and starts a goroutine with the
// main loop for accepted connections.
func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
	e.waiterQueue = waiterQueue
	e.workerRunning = true
	go e.protocolMainLoop(false)
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	// Endpoint must be in listen state before it can accept connections.
	if e.state != StateListen {
		return nil, nil, tcpip.ErrInvalidEndpointState
	}

	// Get the new accepted endpoint.
	var n *endpoint
	select {
	case n = <-e.acceptedChan:
	default:
		return nil, nil, tcpip.ErrWouldBlock
	}

	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally an address.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
	e.mu.Lock()
	defer e.mu.Unlock()

	// Don't allow binding once the endpoint is no longer in the initial
	// state: once it moves to a connected or listen state, it is already
	// bound.
	if e.state != StateInitial {
		return tcpip.ErrAlreadyBound
	}

	e.BindAddr = addr.Addr
	netProto, err := e.checkV4Mapped(&addr)
	if err != nil {
		return err
	}

	// Expand netProtos to include v4 and v6 if the caller is binding to a
	// wildcard (empty) address, and this is an IPv6 endpoint with v6only
	// set to false.
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" {
		netProtos = []tcpip.NetworkProtocolNumber{
			header.IPv6ProtocolNumber,
			header.IPv4ProtocolNumber,
		}
	}

	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice)
	if err != nil {
		return err
	}

	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.ID.LocalPort = port

	// Any failure beyond this point must release the port reservation.
	defer func(bindToDevice tcpip.NICID) {
		if err != nil {
			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice)
			e.isPortReserved = false
			e.effectiveNetProtos = nil
			e.ID.LocalPort = 0
			e.ID.LocalAddress = ""
			e.boundNICID = 0
		}
	}(e.bindToDevice)

	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if len(addr.Addr) != 0 {
		nic := e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return tcpip.ErrBadLocalAddress
		}

		e.boundNICID = nic
		e.ID.LocalAddress = addr.Addr
	}

	// Mark endpoint as bound.
	e.state = StateBound

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	return tcpip.FullAddress{
		Addr: e.ID.LocalAddress,
		Port: e.ID.LocalPort,
		NIC:  e.boundNICID,
	}, nil
}

// GetRemoteAddress returns the address to which the endpoint is connected.
func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	if !e.state.connected() {
		return tcpip.FullAddress{}, tcpip.ErrNotConnected
	}

	return tcpip.FullAddress{
		Addr: e.ID.RemoteAddress,
		Port: e.ID.RemotePort,
		NIC:  e.boundNICID,
	}, nil
}
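
// The sketch below is a minimal illustration of how a caller might drive the
// Bind, Listen and Accept methods above. It assumes an already-constructed TCP
// endpoint and its waiter.Queue; the name acceptOne, the port 8080 and the
// backlog of 10 are illustrative only and not part of the original source.
func acceptOne(ep tcpip.Endpoint, wq *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
	if err := ep.Bind(tcpip.FullAddress{Port: 8080}); err != nil {
		return nil, err
	}
	if err := ep.Listen(10); err != nil {
		return nil, err
	}

	// Accept returns ErrWouldBlock until the listen loop has queued a
	// connection on acceptedChan, so block on an EventIn notification and
	// retry.
	we, ch := waiter.NewChannelEntry(nil)
	wq.EventRegister(&we, waiter.EventIn)
	defer wq.EventUnregister(&we)

	for {
		n, _, err := ep.Accept()
		if err == tcpip.ErrWouldBlock {
			<-ch
			continue
		}
		return n, err
	}
}
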
// HandlePacket is called by the stack when new packets arrive at this
// transport endpoint.
func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
	s := newSegment(r, id, pkt)
	if !s.parse() {
		e.stack.Stats().MalformedRcvdPackets.Increment()
		e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
		s.decRef()
		return
	}

	if !s.csumValid {
		e.stack.Stats().MalformedRcvdPackets.Increment()
		e.stack.Stats().TCP.ChecksumErrors.Increment()
		e.stats.ReceiveErrors.ChecksumErrors.Increment()
		s.decRef()
		return
	}

	e.stack.Stats().TCP.ValidSegmentsReceived.Increment()
	e.stats.SegmentsReceived.Increment()
	if (s.flags & header.TCPFlagRst) != 0 {
		e.stack.Stats().TCP.ResetsReceived.Increment()
	}

	e.enqueueSegment(s)
}

func (e *endpoint) enqueueSegment(s *segment) {
	// Send packet to worker goroutine.
	if e.segmentQueue.enqueue(s) {
		e.newSegmentWaker.Assert()
	} else {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		s.decRef()
	}
}

// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
	switch typ {
	case stack.ControlPacketTooBig:
		e.sndBufMu.Lock()
		e.packetTooBigCount++
		if v := int(extra); v < e.sndMTU {
			e.sndMTU = v
		}
		e.sndBufMu.Unlock()

		e.notifyProtocolGoroutine(notifyMTUChanged)
	}
}

// updateSndBufferUsage is called by the protocol goroutine when room opens up
// in the send buffer. The number of newly available bytes is v.
func (e *endpoint) updateSndBufferUsage(v int) {
	e.sndBufMu.Lock()
	notify := e.sndBufUsed >= e.sndBufSize>>1
	e.sndBufUsed -= v
	// Only notify waiting writers once the in-use portion drops below half
	// of sndBufSize after having been at least half full. This ensures
	// that we don't wake up writers to queue just 1-2 segments and go back
	// to sleep.
	notify = notify && e.sndBufUsed < e.sndBufSize>>1
	e.sndBufMu.Unlock()

	if notify {
		e.waiterQueue.Notify(waiter.EventOut)
	}
}

// readyToRead is called by the protocol goroutine when a new segment is ready
// to be read, or when the connection is closed for receiving (in which case
// s will be nil).
func (e *endpoint) readyToRead(s *segment) {
	e.rcvListMu.Lock()
	if s != nil {
		s.incRef()
		e.rcvBufUsed += s.data.Size()
		// Check if the receive window is now closed. If so, make sure
		// we set the zero window before we deliver the segment to
		// ensure that a subsequent read of the segment will correctly
		// trigger a non-zero window notification.
		if avail := e.receiveBufferAvailableLocked(); avail>>e.rcv.rcvWndScale == 0 {
			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
			e.zeroWindow = true
		}
		e.rcvList.PushBack(s)
	} else {
		e.rcvClosed = true
	}
	e.rcvListMu.Unlock()

	e.waiterQueue.Notify(waiter.EventIn)
}
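
// updateSndBufferUsage above only wakes blocked writers when the send buffer
// transitions from "at least half full" to "less than half full". A minimal
// standalone sketch of that predicate (shouldNotifyWriters is an illustrative
// helper, not part of this package): with sndBufSize = 1<<20, sndBufUsed =
// 600<<10 and released = 100<<10, usage drops from 600 KiB to 500 KiB, crossing
// below the 512 KiB midpoint, so writers are notified.
func shouldNotifyWriters(sndBufUsed, sndBufSize, released int) bool {
	wasAtLeastHalfFull := sndBufUsed >= sndBufSize>>1
	nowBelowHalf := sndBufUsed-released < sndBufSize>>1
	return wasAtLeastHalfFull && nowBelowHalf
}
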
// receiveBufferAvailableLocked calculates how many bytes are still available
// in the receive buffer.
// rcvListMu must be held when this function is called.
func (e *endpoint) receiveBufferAvailableLocked() int {
	// We may use more bytes than the buffer size when the receive buffer
	// shrinks.
	if e.rcvBufUsed >= e.rcvBufSize {
		return 0
	}

	return e.rcvBufSize - e.rcvBufUsed
}

// receiveBufferAvailable calculates how many bytes are still available in the
// receive buffer.
func (e *endpoint) receiveBufferAvailable() int {
	e.rcvListMu.Lock()
	available := e.receiveBufferAvailableLocked()
	e.rcvListMu.Unlock()
	return available
}

func (e *endpoint) receiveBufferSize() int {
	e.rcvListMu.Lock()
	size := e.rcvBufSize
	e.rcvListMu.Unlock()

	return size
}

func (e *endpoint) maxReceiveBufferSize() int {
	var rs ReceiveBufferSizeOption
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
		// As a fallback return the hardcoded max buffer size.
		return MaxBufferSize
	}
	return rs.Max
}

// rcvWndScaleForHandshake computes the receive window scale to offer to the
// peer when window scaling is enabled (true by default). If auto-tuning is
// disabled, the window scaling factor is based on the size of the receive
// buffer; otherwise the maximum permissible receive buffer size is used to
// compute the scale.
func (e *endpoint) rcvWndScaleForHandshake() int {
	bufSizeForScale := e.receiveBufferSize()

	e.rcvListMu.Lock()
	autoTuningDisabled := e.rcvAutoParams.disabled
	e.rcvListMu.Unlock()
	if autoTuningDisabled {
		return FindWndScale(seqnum.Size(bufSizeForScale))
	}

	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
}

// updateRecentTimestamp updates the recent timestamp using the algorithm
// described in https://tools.ietf.org/html/rfc7323#section-4.3
func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
	if e.sendTSOk && seqnum.Value(e.recentTS).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
		e.recentTS = tsVal
	}
}

// maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
// the SYN options indicate that the timestamp option was negotiated. It also
// initializes recentTS with the value provided in synOpts.TSVal.
func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
	if synOpts.TS {
		e.sendTSOk = true
		e.recentTS = synOpts.TSVal
	}
}

// timestamp returns the timestamp value to be used in the TSVal field of the
// timestamp option for outgoing TCP segments for a given endpoint.
func (e *endpoint) timestamp() uint32 {
	return tcpTimeStamp(e.tsOffset)
}

// tcpTimeStamp returns a timestamp offset by the provided offset. This is
// not inlined above as it's used when SYN cookies are in use and the endpoint
// has not yet been created when the SYN cookie is sent.
func tcpTimeStamp(offset uint32) uint32 {
	now := time.Now()
	return uint32(now.Unix()*1000+int64(now.Nanosecond()/1e6)) + offset
}
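
// tcpTimeStamp above derives TSVal as the wall clock in milliseconds,
// truncated to 32 bits, plus the per-endpoint random offset; the value wraps
// modulo 2^32 as RFC 7323 expects. An equivalent standalone sketch (tsValAt is
// an illustrative helper, not part of this package):
func tsValAt(t time.Time, offset uint32) uint32 {
	ms := t.Unix()*1000 + int64(t.Nanosecond())/1e6
	return uint32(ms) + offset
}
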
// timeStampOffset returns a randomized timestamp offset to be used when
// sending timestamp values in a timestamp option for a TCP segment.
func timeStampOffset() uint32 {
	b := make([]byte, 4)
	if _, err := rand.Read(b); err != nil {
		panic(err)
	}
	// Initialize a random tsOffset that will be added to the recentTS
	// every time the timestamp is sent when the Timestamp option is
	// enabled.
	//
	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
	// why this is required.
	//
	// NOTE: This is not completely to spec as normally this should be
	// initialized in a manner analogous to how sequence numbers are
	// randomized, i.e. on a per-connection basis. But for now this is
	// sufficient.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

// maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
// if the SYN options indicate that the SACK option was negotiated and the TCP
// stack is configured to enable the TCP SACK option.
func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
	var v SACKEnabled
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
		// Stack doesn't support SACK. So just return.
		return
	}
	if bool(v) && synOpts.SACKPermitted {
		e.sackPermitted = true
	}
}

// maxOptionSize returns the maximum size of TCP options.
func (e *endpoint) maxOptionSize() (size int) {
	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
	options := e.makeOptions(maxSackBlocks[:])
	size = len(options)
	putOptions(options)

	return size
}

// completeState makes a full copy of the endpoint and returns it. This is used
// before invoking the probe. The state returned may not be fully consistent if
// there are intervening syscalls when the state is being copied.
func (e *endpoint) completeState() stack.TCPEndpointState {
	var s stack.TCPEndpointState
	s.SegTime = time.Now()

	// Copy EndpointID.
	e.mu.Lock()
	s.ID = stack.TCPEndpointID(e.ID)
	e.mu.Unlock()

	// Copy endpoint rcv state.
	e.rcvListMu.Lock()
	s.RcvBufSize = e.rcvBufSize
	s.RcvBufUsed = e.rcvBufUsed
	s.RcvClosed = e.rcvClosed
	s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
	s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
	s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
	s.RcvAutoParams.RTT = e.rcvAutoParams.rtt
	s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
	s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
	s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
	e.rcvListMu.Unlock()

	// Endpoint TCP Option state.
	s.SendTSOk = e.sendTSOk
	s.RecentTS = e.recentTS
	s.TSOffset = e.tsOffset
	s.SACKPermitted = e.sackPermitted
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// Copy endpoint send state.
	e.sndBufMu.Lock()
	s.SndBufSize = e.sndBufSize
	s.SndBufUsed = e.sndBufUsed
	s.SndClosed = e.sndClosed
	s.SndBufInQueue = e.sndBufInQueue
	s.PacketTooBigCount = e.packetTooBigCount
	s.SndMTU = e.sndMTU
	e.sndBufMu.Unlock()

	// Copy receiver state.
	s.Receiver = stack.TCPReceiverState{
		RcvNxt:         e.rcv.rcvNxt,
		RcvAcc:         e.rcv.rcvAcc,
		RcvWndScale:    e.rcv.rcvWndScale,
		PendingBufUsed: e.rcv.pendingBufUsed,
		PendingBufSize: e.rcv.pendingBufSize,
	}

	// Copy sender state.
	s.Sender = stack.TCPSenderState{
		LastSendTime: e.snd.lastSendTime,
		DupAckCount:  e.snd.dupAckCount,
		FastRecovery: stack.TCPFastRecoveryState{
			Active:    e.snd.fr.active,
			First:     e.snd.fr.first,
			Last:      e.snd.fr.last,
			MaxCwnd:   e.snd.fr.maxCwnd,
			HighRxt:   e.snd.fr.highRxt,
			RescueRxt: e.snd.fr.rescueRxt,
		},
		SndCwnd:          e.snd.sndCwnd,
		Ssthresh:         e.snd.sndSsthresh,
		SndCAAckCount:    e.snd.sndCAAckCount,
		Outstanding:      e.snd.outstanding,
		SndWnd:           e.snd.sndWnd,
		SndUna:           e.snd.sndUna,
		SndNxt:           e.snd.sndNxt,
		RTTMeasureSeqNum: e.snd.rttMeasureSeqNum,
		RTTMeasureTime:   e.snd.rttMeasureTime,
		Closed:           e.snd.closed,
		RTO:              e.snd.rto,
		MaxPayloadSize:   e.snd.maxPayloadSize,
		SndWndScale:      e.snd.sndWndScale,
		MaxSentAck:       e.snd.maxSentAck,
	}
	e.snd.rtt.Lock()
	s.Sender.SRTT = e.snd.rtt.srtt
	s.Sender.SRTTInited = e.snd.rtt.srttInited
	e.snd.rtt.Unlock()

	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = stack.TCPCubicState{
			WMax:                    cubic.wMax,
			WLastMax:                cubic.wLastMax,
			T:                       cubic.t,
			TimeSinceLastCongestion: time.Since(cubic.t),
			C:                       cubic.c,
			K:                       cubic.k,
			Beta:                    cubic.beta,
			WC:                      cubic.wC,
			WEst:                    cubic.wEst,
		}
	}
	return s
}

func (e *endpoint) initHardwareGSO() {
	gso := &stack.GSO{}
	switch e.route.NetProto {
	case header.IPv4ProtocolNumber:
		gso.Type = stack.GSOTCPv4
		gso.L3HdrLen = header.IPv4MinimumSize
	case header.IPv6ProtocolNumber:
		gso.Type = stack.GSOTCPv6
		gso.L3HdrLen = header.IPv6MinimumSize
	default:
		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
	}
	gso.NeedsCsum = true
	gso.CsumOffset = header.TCPChecksumOffset
	gso.MaxSize = e.route.GSOMaxSize()
	e.gso = gso
}

func (e *endpoint) initGSO() {
	if e.route.Capabilities()&stack.CapabilityHardwareGSO != 0 {
		e.initHardwareGSO()
	} else if e.route.Capabilities()&stack.CapabilitySoftwareGSO != 0 {
		e.gso = &stack.GSO{
			MaxSize:   e.route.GSOMaxSize(),
			Type:      stack.GSOSW,
			NeedsCsum: false,
		}
	}
}

// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
	e.mu.Lock()
	defer e.mu.Unlock()
	return uint32(e.state)
}

// Info returns a copy of the endpoint info.
func (e *endpoint) Info() tcpip.EndpointInfo {
	e.mu.RLock()
	// Make a copy of the endpoint info.
	ret := e.EndpointInfo
	e.mu.RUnlock()
	return &ret
}

// Stats returns a pointer to the endpoint stats.
func (e *endpoint) Stats() tcpip.EndpointStats {
	return &e.stats
}
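
// completeState above produces the snapshot handed to a registered TCP probe.
// A minimal sketch of such a probe, assuming the stack.TCPProbeFunc signature
// func(stack.TCPEndpointState); logCwndProbe is an illustrative name, not part
// of this package:
func logCwndProbe(s stack.TCPEndpointState) {
	fmt.Printf("tcp %v:%d -> %v:%d cwnd=%d ssthresh=%d srtt=%v\n",
		s.ID.LocalAddress, s.ID.LocalPort,
		s.ID.RemoteAddress, s.ID.RemotePort,
		s.Sender.SndCwnd, s.Sender.Ssthresh, s.Sender.SRTT)
}
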
// Wait implements stack.TransportEndpoint.Wait.
func (e *endpoint) Wait() {
	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
	e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp)
	defer e.waiterQueue.EventUnregister(&waitEntry)
	for {
		e.mu.Lock()
		running := e.workerRunning
		e.mu.Unlock()
		if !running {
			break
		}
		<-notifyCh
	}
}

func mssForRoute(r *stack.Route) uint16 {
	// TODO(b/143359391): Respect TCP Min and Max size.
	return uint16(r.MTU() - header.TCPMinimumSize)
}
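
// mssForRoute above advertises an MSS of the route MTU minus the minimum TCP
// header size. A worked sketch (exampleEthernetIPv4MSS is illustrative only):
// on a 1500-byte Ethernet link the IPv4 network endpoint already accounts for
// its 20-byte header, so r.MTU() reports 1480 and the advertised MSS comes out
// to the familiar 1460.
func exampleEthernetIPv4MSS() uint16 {
	const ipv4RouteMTU = 1480 // 1500-byte link MTU less the 20-byte IPv4 header.
	return uint16(ipv4RouteMTU - header.TCPMinimumSize) // 1460
}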