github.com/flowerwrong/netstack@v0.0.0-20191009141956-e5848263af28/tcpip/transport/tcp/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "encoding/binary" 19 "fmt" 20 "math" 21 "strings" 22 "sync" 23 "sync/atomic" 24 "time" 25 26 "github.com/FlowerWrong/netstack/rand" 27 "github.com/FlowerWrong/netstack/sleep" 28 "github.com/FlowerWrong/netstack/tcpip" 29 "github.com/FlowerWrong/netstack/tcpip/buffer" 30 "github.com/FlowerWrong/netstack/tcpip/hash/jenkins" 31 "github.com/FlowerWrong/netstack/tcpip/header" 32 "github.com/FlowerWrong/netstack/tcpip/iptables" 33 "github.com/FlowerWrong/netstack/tcpip/seqnum" 34 "github.com/FlowerWrong/netstack/tcpip/stack" 35 "github.com/FlowerWrong/netstack/tmutex" 36 "github.com/FlowerWrong/netstack/waiter" 37 ) 38 39 // EndpointState represents the state of a TCP endpoint. 40 type EndpointState uint32 41 42 // Endpoint states. Note that are represented in a netstack-specific manner and 43 // may not be meaningful externally. Specifically, they need to be translated to 44 // Linux's representation for these states if presented to userspace. 45 const ( 46 // Endpoint states internal to netstack. These map to the TCP state CLOSED. 47 StateInitial EndpointState = iota 48 StateBound 49 StateConnecting // Connect() called, but the initial SYN hasn't been sent. 50 StateError 51 52 // TCP protocol states. 53 StateEstablished 54 StateSynSent 55 StateSynRecv 56 StateFinWait1 57 StateFinWait2 58 StateTimeWait 59 StateClose 60 StateCloseWait 61 StateLastAck 62 StateListen 63 StateClosing 64 ) 65 66 // connected is the set of states where an endpoint is connected to a peer. 67 func (s EndpointState) connected() bool { 68 switch s { 69 case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: 70 return true 71 default: 72 return false 73 } 74 } 75 76 // String implements fmt.Stringer.String. 77 func (s EndpointState) String() string { 78 switch s { 79 case StateInitial: 80 return "INITIAL" 81 case StateBound: 82 return "BOUND" 83 case StateConnecting: 84 return "CONNECTING" 85 case StateError: 86 return "ERROR" 87 case StateEstablished: 88 return "ESTABLISHED" 89 case StateSynSent: 90 return "SYN-SENT" 91 case StateSynRecv: 92 return "SYN-RCVD" 93 case StateFinWait1: 94 return "FIN-WAIT1" 95 case StateFinWait2: 96 return "FIN-WAIT2" 97 case StateTimeWait: 98 return "TIME-WAIT" 99 case StateClose: 100 return "CLOSED" 101 case StateCloseWait: 102 return "CLOSE-WAIT" 103 case StateLastAck: 104 return "LAST-ACK" 105 case StateListen: 106 return "LISTEN" 107 case StateClosing: 108 return "CLOSING" 109 default: 110 panic("unreachable") 111 } 112 } 113 114 // Reasons for notifying the protocol goroutine. 
115 const ( 116 notifyNonZeroReceiveWindow = 1 << iota 117 notifyReceiveWindowChanged 118 notifyClose 119 notifyMTUChanged 120 notifyDrain 121 notifyReset 122 notifyKeepaliveChanged 123 notifyMSSChanged 124 ) 125 126 // SACKInfo holds TCP SACK related information for a given endpoint. 127 // 128 // +stateify savable 129 type SACKInfo struct { 130 // Blocks is the maximum number of SACK blocks we track 131 // per endpoint. 132 Blocks [MaxSACKBlocks]header.SACKBlock 133 134 // NumBlocks is the number of valid SACK blocks stored in the 135 // blocks array above. 136 NumBlocks int 137 } 138 139 // rcvBufAutoTuneParams are used to hold state variables to compute 140 // the auto tuned recv buffer size. 141 // 142 // +stateify savable 143 type rcvBufAutoTuneParams struct { 144 // measureTime is the time at which the current measurement 145 // was started. 146 measureTime time.Time 147 148 // copied is the number of bytes copied out of the receive 149 // buffers since this measure began. 150 copied int 151 152 // prevCopied is the number of bytes copied out of the receive 153 // buffers in the previous RTT period. 154 prevCopied int 155 156 // rtt is the non-smoothed minimum RTT as measured by observing the time 157 // between when a byte is first acknowledged and the receipt of data 158 // that is at least one window beyond the sequence number that was 159 // acknowledged. 160 rtt time.Duration 161 162 // rttMeasureSeqNumber is the highest acceptable sequence number at the 163 // time this RTT measurement period began. 164 rttMeasureSeqNumber seqnum.Value 165 166 // rttMeasureTime is the absolute time at which the current rtt 167 // measurement period began. 168 rttMeasureTime time.Time 169 170 // disabled is true if an explicit receive buffer is set for the 171 // endpoint. 172 disabled bool 173 } 174 175 // endpoint represents a TCP endpoint. This struct serves as the interface 176 // between users of the endpoint and the protocol implementation; it is legal to 177 // have concurrent goroutines make calls into the endpoint, they are properly 178 // synchronized. The protocol implementation, however, runs in a single 179 // goroutine. 180 // 181 // +stateify savable 182 type endpoint struct { 183 // workMu is used to arbitrate which goroutine may perform protocol 184 // work. Only the main protocol goroutine is expected to call Lock() on 185 // it, but other goroutines (e.g., send) may call TryLock() to eagerly 186 // perform work without having to wait for the main one to wake up. 187 workMu tmutex.Mutex 188 189 // The following fields are initialized at creation time and do not 190 // change throughout the lifetime of the endpoint. 191 stack *stack.Stack 192 netProto tcpip.NetworkProtocolNumber 193 waiterQueue *waiter.Queue 194 195 // lastError represents the last error that the endpoint reported; 196 // access to it is protected by the following mutex. 197 lastErrorMu sync.Mutex 198 lastError *tcpip.Error 199 200 // The following fields are used to manage the receive queue. The 201 // protocol goroutine adds ready-for-delivery segments to rcvList, 202 // which are returned by Read() calls to users. 203 // 204 // Once the peer has closed its send side, rcvClosed is set to true 205 // to indicate to users that no more data is coming. 206 // 207 // rcvListMu can be taken after the endpoint mu below. 
208 rcvListMu sync.Mutex 209 rcvList segmentList 210 rcvClosed bool 211 rcvBufSize int 212 rcvBufUsed int 213 rcvAutoParams rcvBufAutoTuneParams 214 // zeroWindow indicates that the window was closed due to receive buffer 215 // space being filled up. This is set by the worker goroutine before 216 // moving a segment to the rcvList. This setting is cleared by the 217 // endpoint when a Read() call reads enough data for the new window to 218 // be non-zero. 219 zeroWindow bool 220 221 // The following fields are protected by the mutex. 222 mu sync.RWMutex 223 id stack.TransportEndpointID 224 225 state EndpointState 226 227 isPortReserved bool 228 isRegistered bool 229 boundNICID tcpip.NICID 230 route stack.Route 231 ttl uint8 232 v6only bool 233 isConnectNotified bool 234 // TCP should never broadcast but Linux nevertheless supports enabling/ 235 // disabling SO_BROADCAST, albeit as a NOOP. 236 broadcast bool 237 238 // effectiveNetProtos contains the network protocols actually in use. In 239 // most cases it will only contain "netProto", but in cases like IPv6 240 // endpoints with v6only set to false, this could include multiple 241 // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., 242 // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped 243 // address). 244 effectiveNetProtos []tcpip.NetworkProtocolNumber 245 246 // hardError is meaningful only when state is stateError, it stores the 247 // error to be returned when read/write syscalls are called and the 248 // endpoint is in this state. hardError is protected by mu. 249 hardError *tcpip.Error 250 251 // workerRunning specifies if a worker goroutine is running. 252 workerRunning bool 253 254 // workerCleanup specifies if the worker goroutine must perform cleanup 255 // before exitting. This can only be set to true when workerRunning is 256 // also true, and they're both protected by the mutex. 257 workerCleanup bool 258 259 // sendTSOk is used to indicate when the TS Option has been negotiated. 260 // When sendTSOk is true every non-RST segment should carry a TS as per 261 // RFC7323#section-1.1 262 sendTSOk bool 263 264 // recentTS is the timestamp that should be sent in the TSEcr field of 265 // the timestamp for future segments sent by the endpoint. This field is 266 // updated if required when a new segment is received by this endpoint. 267 recentTS uint32 268 269 // tsOffset is a randomized offset added to the value of the 270 // TSVal field in the timestamp option. 271 tsOffset uint32 272 273 // shutdownFlags represent the current shutdown state of the endpoint. 274 shutdownFlags tcpip.ShutdownFlags 275 276 // sackPermitted is set to true if the peer sends the TCPSACKPermitted 277 // option in the SYN/SYN-ACK. 278 sackPermitted bool 279 280 // sack holds TCP SACK related information for this endpoint. 281 sack SACKInfo 282 283 // reusePort is set to true if SO_REUSEPORT is enabled. 284 reusePort bool 285 286 // bindToDevice is set to the NIC on which to bind or disabled if 0. 287 bindToDevice tcpip.NICID 288 289 // delay enables Nagle's algorithm. 290 // 291 // delay is a boolean (0 is false) and must be accessed atomically. 292 delay uint32 293 294 // cork holds back segments until full. 295 // 296 // cork is a boolean (0 is false) and must be accessed atomically. 297 cork uint32 298 299 // scoreboard holds TCP SACK Scoreboard information for this endpoint. 
300 scoreboard *SACKScoreboard 301 302 // The options below aren't implemented, but we remember the user 303 // settings because applications expect to be able to set/query these 304 // options. 305 reuseAddr bool 306 307 // slowAck holds the negated state of quick ack. It is stubbed out and 308 // does nothing. 309 // 310 // slowAck is a boolean (0 is false) and must be accessed atomically. 311 slowAck uint32 312 313 // segmentQueue is used to hand received segments to the protocol 314 // goroutine. Segments are queued as long as the queue is not full, 315 // and dropped when it is. 316 segmentQueue segmentQueue 317 318 // synRcvdCount is the number of connections for this endpoint that are 319 // in SYN-RCVD state. 320 synRcvdCount int 321 322 // userMSS if non-zero is the MSS value explicitly set by the user 323 // for this endpoint using the TCP_MAXSEG setsockopt. 324 userMSS int 325 326 // The following fields are used to manage the send buffer. When 327 // segments are ready to be sent, they are added to sndQueue and the 328 // protocol goroutine is signaled via sndWaker. 329 // 330 // When the send side is closed, the protocol goroutine is notified via 331 // sndCloseWaker, and sndClosed is set to true. 332 sndBufMu sync.Mutex 333 sndBufSize int 334 sndBufUsed int 335 sndClosed bool 336 sndBufInQueue seqnum.Size 337 sndQueue segmentList 338 sndWaker sleep.Waker 339 sndCloseWaker sleep.Waker 340 341 // cc stores the name of the Congestion Control algorithm to use for 342 // this endpoint. 343 cc tcpip.CongestionControlOption 344 345 // The following are used when a "packet too big" control packet is 346 // received. They are protected by sndBufMu. They are used to 347 // communicate to the main protocol goroutine how many such control 348 // messages have been received since the last notification was processed 349 // and what was the smallest MTU seen. 350 packetTooBigCount int 351 sndMTU int 352 353 // newSegmentWaker is used to indicate to the protocol goroutine that 354 // it needs to wake up and handle new segments queued to it. 355 newSegmentWaker sleep.Waker 356 357 // notificationWaker is used to indicate to the protocol goroutine that 358 // it needs to wake up and check for notifications. 359 notificationWaker sleep.Waker 360 361 // notifyFlags is a bitmask of flags used to indicate to the protocol 362 // goroutine what it was notified; this is only accessed atomically. 363 notifyFlags uint32 364 365 // keepalive manages TCP keepalive state. When the connection is idle 366 // (no data sent or received) for keepaliveIdle, we start sending 367 // keepalives every keepalive.interval. If we send keepalive.count 368 // without hearing a response, the connection is closed. 369 keepalive keepalive 370 371 // pendingAccepted is a synchronization primitive used to track number 372 // of connections that are queued up to be delivered to the accepted 373 // channel. We use this to ensure that all goroutines blocked on writing 374 // to the acceptedChan below terminate before we close acceptedChan. 375 pendingAccepted sync.WaitGroup 376 377 // acceptedChan is used by a listening endpoint protocol goroutine to 378 // send newly accepted connections to the endpoint so that they can be 379 // read by Accept() calls. 380 acceptedChan chan *endpoint 381 382 // The following are only used from the protocol goroutine, and 383 // therefore don't need locks to protect them. 384 rcv *receiver 385 snd *sender 386 387 // The goroutine drain completion notification channel. 
388 drainDone chan struct{} 389 390 // The goroutine undrain notification channel. This is currently used as 391 // a way to block the worker goroutines. Today nothing closes/writes 392 // this channel and this causes any goroutines waiting on this to just 393 // block. This is used during save/restore to prevent worker goroutines 394 // from mutating state as it's being saved. 395 undrain chan struct{} 396 397 // probe if not nil is invoked on every received segment. It is passed 398 // a copy of the current state of the endpoint. 399 probe stack.TCPProbeFunc 400 401 // The following are only used to assist the restore run to re-connect. 402 bindAddress tcpip.Address 403 connectingAddress tcpip.Address 404 405 // amss is the advertised MSS to the peer by this endpoint. 406 amss uint16 407 408 gso *stack.GSO 409 } 410 411 // StopWork halts packet processing. Only to be used in tests. 412 func (e *endpoint) StopWork() { 413 e.workMu.Lock() 414 } 415 416 // ResumeWork resumes packet processing. Only to be used in tests. 417 func (e *endpoint) ResumeWork() { 418 e.workMu.Unlock() 419 } 420 421 // keepalive is a synchronization wrapper used to appease stateify. See the 422 // comment in endpoint, where it is used. 423 // 424 // +stateify savable 425 type keepalive struct { 426 sync.Mutex 427 enabled bool 428 idle time.Duration 429 interval time.Duration 430 count int 431 unacked int 432 timer timer 433 waker sleep.Waker 434 } 435 436 func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { 437 e := &endpoint{ 438 stack: stack, 439 netProto: netProto, 440 waiterQueue: waiterQueue, 441 state: StateInitial, 442 rcvBufSize: DefaultReceiveBufferSize, 443 sndBufSize: DefaultSendBufferSize, 444 sndMTU: int(math.MaxInt32), 445 reuseAddr: true, 446 keepalive: keepalive{ 447 // Linux defaults. 448 idle: 2 * time.Hour, 449 interval: 75 * time.Second, 450 count: 9, 451 }, 452 } 453 454 var ss SendBufferSizeOption 455 if err := stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil { 456 e.sndBufSize = ss.Default 457 } 458 459 var rs ReceiveBufferSizeOption 460 if err := stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { 461 e.rcvBufSize = rs.Default 462 } 463 464 var cs tcpip.CongestionControlOption 465 if err := stack.TransportProtocolOption(ProtocolNumber, &cs); err == nil { 466 e.cc = cs 467 } 468 469 var mrb tcpip.ModerateReceiveBufferOption 470 if err := stack.TransportProtocolOption(ProtocolNumber, &mrb); err == nil { 471 e.rcvAutoParams.disabled = !bool(mrb) 472 } 473 474 if p := stack.GetTCPProbe(); p != nil { 475 e.probe = p 476 } 477 478 e.segmentQueue.setLimit(MaxUnprocessedSegments) 479 e.workMu.Init() 480 e.workMu.Lock() 481 e.tsOffset = timeStampOffset() 482 483 return e 484 } 485 486 // Readiness returns the current readiness of the endpoint. For example, if 487 // waiter.EventIn is set, the endpoint is immediately readable. 488 func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { 489 result := waiter.EventMask(0) 490 491 e.mu.RLock() 492 defer e.mu.RUnlock() 493 494 switch e.state { 495 case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv: 496 // Ready for nothing. 497 498 case StateClose, StateError: 499 // Ready for anything. 500 result = mask 501 502 case StateListen: 503 // Check if there's anything in the accepted channel. 
504 if (mask & waiter.EventIn) != 0 { 505 if len(e.acceptedChan) > 0 { 506 result |= waiter.EventIn 507 } 508 } 509 } 510 if e.state.connected() { 511 // Determine if the endpoint is writable if requested. 512 if (mask & waiter.EventOut) != 0 { 513 e.sndBufMu.Lock() 514 if e.sndClosed || e.sndBufUsed < e.sndBufSize { 515 result |= waiter.EventOut 516 } 517 e.sndBufMu.Unlock() 518 } 519 520 // Determine if the endpoint is readable if requested. 521 if (mask & waiter.EventIn) != 0 { 522 e.rcvListMu.Lock() 523 if e.rcvBufUsed > 0 || e.rcvClosed { 524 result |= waiter.EventIn 525 } 526 e.rcvListMu.Unlock() 527 } 528 } 529 530 return result 531 } 532 533 func (e *endpoint) fetchNotifications() uint32 { 534 return atomic.SwapUint32(&e.notifyFlags, 0) 535 } 536 537 func (e *endpoint) notifyProtocolGoroutine(n uint32) { 538 for { 539 v := atomic.LoadUint32(&e.notifyFlags) 540 if v&n == n { 541 // The flags are already set. 542 return 543 } 544 545 if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) { 546 if v == 0 { 547 // We are causing a transition from no flags to 548 // at least one flag set, so we must cause the 549 // protocol goroutine to wake up. 550 e.notificationWaker.Assert() 551 } 552 return 553 } 554 } 555 } 556 557 // Close puts the endpoint in a closed state and frees all resources associated 558 // with it. It must be called only once and with no other concurrent calls to 559 // the endpoint. 560 func (e *endpoint) Close() { 561 // Issue a shutdown so that the peer knows we won't send any more data 562 // if we're connected, or stop accepting if we're listening. 563 e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead) 564 565 e.mu.Lock() 566 567 // For listening sockets, we always release ports inline so that they 568 // are immediately available for reuse after Close() is called. If also 569 // registered, we unregister as well otherwise the next user would fail 570 // in Listen() when trying to register. 571 if e.state == StateListen && e.isPortReserved { 572 if e.isRegistered { 573 e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.bindToDevice) 574 e.isRegistered = false 575 } 576 577 e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort, e.bindToDevice) 578 e.isPortReserved = false 579 } 580 581 // Either perform the local cleanup or kick the worker to make sure it 582 // knows it needs to cleanup. 583 tcpip.AddDanglingEndpoint(e) 584 if !e.workerRunning { 585 e.cleanupLocked() 586 } else { 587 e.workerCleanup = true 588 e.notifyProtocolGoroutine(notifyClose) 589 } 590 591 e.mu.Unlock() 592 } 593 594 // closePendingAcceptableConnections closes all connections that have completed 595 // handshake but not yet been delivered to the application. 596 func (e *endpoint) closePendingAcceptableConnectionsLocked() { 597 done := make(chan struct{}) 598 // Spin a goroutine up as ranging on e.acceptedChan will just block when 599 // there are no more connections in the channel. Using a non-blocking 600 // select does not work as it can potentially select the default case 601 // even when there are pending writes but that are not yet written to 602 // the channel. 
603 go func() { 604 defer close(done) 605 for n := range e.acceptedChan { 606 n.mu.Lock() 607 n.resetConnectionLocked(tcpip.ErrConnectionAborted) 608 n.mu.Unlock() 609 n.Close() 610 } 611 }() 612 // pendingAccepted(see endpoint.deliverAccepted) tracks the number of 613 // endpoints which have completed handshake but are not yet written to 614 // the e.acceptedChan. We wait here till the goroutine above can drain 615 // all such connections from e.acceptedChan. 616 e.pendingAccepted.Wait() 617 close(e.acceptedChan) 618 <-done 619 e.acceptedChan = nil 620 } 621 622 // cleanupLocked frees all resources associated with the endpoint. It is called 623 // after Close() is called and the worker goroutine (if any) is done with its 624 // work. 625 func (e *endpoint) cleanupLocked() { 626 // Close all endpoints that might have been accepted by TCP but not by 627 // the client. 628 if e.acceptedChan != nil { 629 e.closePendingAcceptableConnectionsLocked() 630 } 631 e.workerCleanup = false 632 633 if e.isRegistered { 634 e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.bindToDevice) 635 e.isRegistered = false 636 } 637 638 if e.isPortReserved { 639 e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort, e.bindToDevice) 640 e.isPortReserved = false 641 } 642 643 e.route.Release() 644 tcpip.DeleteDanglingEndpoint(e) 645 } 646 647 // initialReceiveWindow returns the initial receive window to advertise in the 648 // SYN/SYN-ACK. 649 func (e *endpoint) initialReceiveWindow() int { 650 rcvWnd := e.receiveBufferAvailable() 651 if rcvWnd > math.MaxUint16 { 652 rcvWnd = math.MaxUint16 653 } 654 routeWnd := InitialCwnd * int(mssForRoute(&e.route)) * 2 655 if rcvWnd > routeWnd { 656 rcvWnd = routeWnd 657 } 658 return rcvWnd 659 } 660 661 // ModerateRecvBuf adjusts the receive buffer and the advertised window 662 // based on the number of bytes copied to user space. 663 func (e *endpoint) ModerateRecvBuf(copied int) { 664 e.rcvListMu.Lock() 665 if e.rcvAutoParams.disabled { 666 e.rcvListMu.Unlock() 667 return 668 } 669 now := time.Now() 670 if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt { 671 e.rcvAutoParams.copied += copied 672 e.rcvListMu.Unlock() 673 return 674 } 675 prevRTTCopied := e.rcvAutoParams.copied + copied 676 prevCopied := e.rcvAutoParams.prevCopied 677 rcvWnd := 0 678 if prevRTTCopied > prevCopied { 679 // The minimal receive window based on what was copied by the app 680 // in the immediate preceding RTT and some extra buffer for 16 681 // segments to account for variations. 682 // We multiply by 2 to account for packet losses. 683 rcvWnd = prevRTTCopied*2 + 16*int(e.amss) 684 685 // Scale for slow start based on bytes copied in this RTT vs previous. 686 grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied 687 688 // Multiply growth factor by 2 again to account for sender being 689 // in slow-start where the sender grows it's congestion window 690 // by 100% per RTT. 691 rcvWnd += grow * 2 692 693 // Make sure auto tuned buffer size can always receive upto 2x 694 // the initial window of 10 segments. 695 if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd { 696 rcvWnd = minRcvWnd 697 } 698 699 // Cap the auto tuned buffer size by the maximum permissible 700 // receive buffer size. 
701 if max := e.maxReceiveBufferSize(); rcvWnd > max { 702 rcvWnd = max 703 } 704 705 // We do not adjust downwards as that can cause the receiver to 706 // reject valid data that might already be in flight as the 707 // acceptable window will shrink. 708 if rcvWnd > e.rcvBufSize { 709 e.rcvBufSize = rcvWnd 710 e.notifyProtocolGoroutine(notifyReceiveWindowChanged) 711 } 712 713 // We only update prevCopied when we grow the buffer because in cases 714 // where prevCopied > prevRTTCopied the existing buffer is already big 715 // enough to handle the current rate and we don't need to do any 716 // adjustments. 717 e.rcvAutoParams.prevCopied = prevRTTCopied 718 } 719 e.rcvAutoParams.measureTime = now 720 e.rcvAutoParams.copied = 0 721 e.rcvListMu.Unlock() 722 } 723 724 // IPTables implements tcpip.Endpoint.IPTables. 725 func (e *endpoint) IPTables() (iptables.IPTables, error) { 726 return e.stack.IPTables(), nil 727 } 728 729 // Read reads data from the endpoint. 730 func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { 731 e.mu.RLock() 732 // The endpoint can be read if it's connected, or if it's already closed 733 // but has some pending unread data. Also note that a RST being received 734 // would cause the state to become StateError so we should allow the 735 // reads to proceed before returning a ECONNRESET. 736 e.rcvListMu.Lock() 737 bufUsed := e.rcvBufUsed 738 if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 { 739 e.rcvListMu.Unlock() 740 he := e.hardError 741 e.mu.RUnlock() 742 if s == StateError { 743 return buffer.View{}, tcpip.ControlMessages{}, he 744 } 745 return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState 746 } 747 748 v, err := e.readLocked() 749 e.rcvListMu.Unlock() 750 751 e.mu.RUnlock() 752 753 return v, tcpip.ControlMessages{}, err 754 } 755 756 func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { 757 if e.rcvBufUsed == 0 { 758 if e.rcvClosed || !e.state.connected() { 759 return buffer.View{}, tcpip.ErrClosedForReceive 760 } 761 return buffer.View{}, tcpip.ErrWouldBlock 762 } 763 764 s := e.rcvList.Front() 765 views := s.data.Views() 766 v := views[s.viewToDeliver] 767 s.viewToDeliver++ 768 769 if s.viewToDeliver >= len(views) { 770 e.rcvList.Remove(s) 771 s.decRef() 772 } 773 774 e.rcvBufUsed -= len(v) 775 // If the window was zero before this read and if the read freed up 776 // enough buffer space for the scaled window to be non-zero then notify 777 // the protocol goroutine to send a window update. 778 if e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale) { 779 e.zeroWindow = false 780 e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) 781 } 782 783 return v, nil 784 } 785 786 // isEndpointWritableLocked checks if a given endpoint is writable 787 // and also returns the number of bytes that can be written at this 788 // moment. If the endpoint is not writable then it returns an error 789 // indicating the reason why it's not writable. 790 // Caller must hold e.mu and e.sndBufMu 791 func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) { 792 // The endpoint cannot be written to if it's not connected. 793 if !e.state.connected() { 794 switch e.state { 795 case StateError: 796 return 0, e.hardError 797 default: 798 return 0, tcpip.ErrClosedForSend 799 } 800 } 801 802 // Check if the connection has already been closed for sends. 
803 if e.sndClosed { 804 return 0, tcpip.ErrClosedForSend 805 } 806 807 avail := e.sndBufSize - e.sndBufUsed 808 if avail <= 0 { 809 return 0, tcpip.ErrWouldBlock 810 } 811 return avail, nil 812 } 813 814 // Write writes data to the endpoint's peer. 815 func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) { 816 // Linux completely ignores any address passed to sendto(2) for TCP sockets 817 // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More 818 // and opts.EndOfRecord are also ignored. 819 820 e.mu.RLock() 821 e.sndBufMu.Lock() 822 823 avail, err := e.isEndpointWritableLocked() 824 if err != nil { 825 e.sndBufMu.Unlock() 826 e.mu.RUnlock() 827 return 0, nil, err 828 } 829 830 // We can release locks while copying data. 831 // 832 // This is not possible if atomic is set, because we can't allow the 833 // available buffer space to be consumed by some other caller while we 834 // are copying data in. 835 if !opts.Atomic { 836 e.sndBufMu.Unlock() 837 e.mu.RUnlock() 838 } 839 840 // Fetch data. 841 v, perr := p.Payload(avail) 842 if perr != nil || len(v) == 0 { 843 if opts.Atomic { // See above. 844 e.sndBufMu.Unlock() 845 e.mu.RUnlock() 846 } 847 // Note that perr may be nil if len(v) == 0. 848 return 0, nil, perr 849 } 850 851 if !opts.Atomic { // See above. 852 e.mu.RLock() 853 e.sndBufMu.Lock() 854 855 // Because we released the lock before copying, check state again 856 // to make sure the endpoint is still in a valid state for a write. 857 avail, err = e.isEndpointWritableLocked() 858 if err != nil { 859 e.sndBufMu.Unlock() 860 e.mu.RUnlock() 861 return 0, nil, err 862 } 863 864 // Discard any excess data copied in due to avail being reduced due 865 // to a simultaneous write call to the socket. 866 if avail < len(v) { 867 v = v[:avail] 868 } 869 } 870 871 // Add data to the send queue. 872 s := newSegmentFromView(&e.route, e.id, v) 873 e.sndBufUsed += len(v) 874 e.sndBufInQueue += seqnum.Size(len(v)) 875 e.sndQueue.PushBack(s) 876 e.sndBufMu.Unlock() 877 // Release the endpoint lock to prevent deadlocks due to lock 878 // order inversion when acquiring workMu. 879 e.mu.RUnlock() 880 881 if e.workMu.TryLock() { 882 // Do the work inline. 883 e.handleWrite() 884 e.workMu.Unlock() 885 } else { 886 // Let the protocol goroutine do the work. 887 e.sndWaker.Assert() 888 } 889 890 return int64(len(v)), nil, nil 891 } 892 893 // Peek reads data without consuming it from the endpoint. 894 // 895 // This method does not block if there is no data pending. 896 func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { 897 e.mu.RLock() 898 defer e.mu.RUnlock() 899 900 // The endpoint can be read if it's connected, or if it's already closed 901 // but has some pending unread data. 902 if s := e.state; !s.connected() && s != StateClose { 903 if s == StateError { 904 return 0, tcpip.ControlMessages{}, e.hardError 905 } 906 return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState 907 } 908 909 e.rcvListMu.Lock() 910 defer e.rcvListMu.Unlock() 911 912 if e.rcvBufUsed == 0 { 913 if e.rcvClosed || !e.state.connected() { 914 return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive 915 } 916 return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock 917 } 918 919 // Make a copy of vec so we can modify the slide headers. 920 vec = append([][]byte(nil), vec...) 
921 922 var num int64 923 for s := e.rcvList.Front(); s != nil; s = s.Next() { 924 views := s.data.Views() 925 926 for i := s.viewToDeliver; i < len(views); i++ { 927 v := views[i] 928 929 for len(v) > 0 { 930 if len(vec) == 0 { 931 return num, tcpip.ControlMessages{}, nil 932 } 933 if len(vec[0]) == 0 { 934 vec = vec[1:] 935 continue 936 } 937 938 n := copy(vec[0], v) 939 v = v[n:] 940 vec[0] = vec[0][n:] 941 num += int64(n) 942 } 943 } 944 } 945 946 return num, tcpip.ControlMessages{}, nil 947 } 948 949 // zeroReceiveWindow checks if the receive window to be announced now would be 950 // zero, based on the amount of available buffer and the receive window scaling. 951 // 952 // It must be called with rcvListMu held. 953 func (e *endpoint) zeroReceiveWindow(scale uint8) bool { 954 if e.rcvBufUsed >= e.rcvBufSize { 955 return true 956 } 957 958 return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0 959 } 960 961 // SetSockOptInt sets a socket option. 962 func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error { 963 switch opt { 964 case tcpip.ReceiveBufferSizeOption: 965 // Make sure the receive buffer size is within the min and max 966 // allowed. 967 var rs ReceiveBufferSizeOption 968 size := int(v) 969 if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { 970 if size < rs.Min { 971 size = rs.Min 972 } 973 if size > rs.Max { 974 size = rs.Max 975 } 976 } 977 978 mask := uint32(notifyReceiveWindowChanged) 979 980 e.rcvListMu.Lock() 981 982 // Make sure the receive buffer size allows us to send a 983 // non-zero window size. 984 scale := uint8(0) 985 if e.rcv != nil { 986 scale = e.rcv.rcvWndScale 987 } 988 if size>>scale == 0 { 989 size = 1 << scale 990 } 991 992 // Make sure 2*size doesn't overflow. 993 if size > math.MaxInt32/2 { 994 size = math.MaxInt32 / 2 995 } 996 997 e.rcvBufSize = size 998 e.rcvAutoParams.disabled = true 999 if e.zeroWindow && !e.zeroReceiveWindow(scale) { 1000 e.zeroWindow = false 1001 mask |= notifyNonZeroReceiveWindow 1002 } 1003 e.rcvListMu.Unlock() 1004 1005 e.notifyProtocolGoroutine(mask) 1006 return nil 1007 1008 case tcpip.SendBufferSizeOption: 1009 // Make sure the send buffer size is within the min and max 1010 // allowed. 1011 size := int(v) 1012 var ss SendBufferSizeOption 1013 if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil { 1014 if size < ss.Min { 1015 size = ss.Min 1016 } 1017 if size > ss.Max { 1018 size = ss.Max 1019 } 1020 } 1021 1022 e.sndBufMu.Lock() 1023 e.sndBufSize = size 1024 e.sndBufMu.Unlock() 1025 return nil 1026 1027 default: 1028 return nil 1029 } 1030 } 1031 1032 // SetSockOpt sets a socket option. 1033 func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { 1034 switch v := opt.(type) { 1035 case tcpip.DelayOption: 1036 if v == 0 { 1037 atomic.StoreUint32(&e.delay, 0) 1038 1039 // Handle delayed data. 1040 e.sndWaker.Assert() 1041 } else { 1042 atomic.StoreUint32(&e.delay, 1) 1043 } 1044 return nil 1045 1046 case tcpip.CorkOption: 1047 if v == 0 { 1048 atomic.StoreUint32(&e.cork, 0) 1049 1050 // Handle the corked data. 
1051 e.sndWaker.Assert() 1052 } else { 1053 atomic.StoreUint32(&e.cork, 1) 1054 } 1055 return nil 1056 1057 case tcpip.ReuseAddressOption: 1058 e.mu.Lock() 1059 e.reuseAddr = v != 0 1060 e.mu.Unlock() 1061 return nil 1062 1063 case tcpip.ReusePortOption: 1064 e.mu.Lock() 1065 e.reusePort = v != 0 1066 e.mu.Unlock() 1067 return nil 1068 1069 case tcpip.BindToDeviceOption: 1070 e.mu.Lock() 1071 defer e.mu.Unlock() 1072 if v == "" { 1073 e.bindToDevice = 0 1074 return nil 1075 } 1076 for nicid, nic := range e.stack.NICInfo() { 1077 if nic.Name == string(v) { 1078 e.bindToDevice = nicid 1079 return nil 1080 } 1081 } 1082 return tcpip.ErrUnknownDevice 1083 1084 case tcpip.QuickAckOption: 1085 if v == 0 { 1086 atomic.StoreUint32(&e.slowAck, 1) 1087 } else { 1088 atomic.StoreUint32(&e.slowAck, 0) 1089 } 1090 return nil 1091 1092 case tcpip.MaxSegOption: 1093 userMSS := v 1094 if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS { 1095 return tcpip.ErrInvalidOptionValue 1096 } 1097 e.mu.Lock() 1098 e.userMSS = int(userMSS) 1099 e.mu.Unlock() 1100 e.notifyProtocolGoroutine(notifyMSSChanged) 1101 return nil 1102 1103 case tcpip.V6OnlyOption: 1104 // We only recognize this option on v6 endpoints. 1105 if e.netProto != header.IPv6ProtocolNumber { 1106 return tcpip.ErrInvalidEndpointState 1107 } 1108 1109 e.mu.Lock() 1110 defer e.mu.Unlock() 1111 1112 // We only allow this to be set when we're in the initial state. 1113 if e.state != StateInitial { 1114 return tcpip.ErrInvalidEndpointState 1115 } 1116 1117 e.v6only = v != 0 1118 return nil 1119 1120 case tcpip.TTLOption: 1121 e.mu.Lock() 1122 e.ttl = uint8(v) 1123 e.mu.Unlock() 1124 return nil 1125 1126 case tcpip.KeepaliveEnabledOption: 1127 e.keepalive.Lock() 1128 e.keepalive.enabled = v != 0 1129 e.keepalive.Unlock() 1130 e.notifyProtocolGoroutine(notifyKeepaliveChanged) 1131 return nil 1132 1133 case tcpip.KeepaliveIdleOption: 1134 e.keepalive.Lock() 1135 e.keepalive.idle = time.Duration(v) 1136 e.keepalive.Unlock() 1137 e.notifyProtocolGoroutine(notifyKeepaliveChanged) 1138 return nil 1139 1140 case tcpip.KeepaliveIntervalOption: 1141 e.keepalive.Lock() 1142 e.keepalive.interval = time.Duration(v) 1143 e.keepalive.Unlock() 1144 e.notifyProtocolGoroutine(notifyKeepaliveChanged) 1145 return nil 1146 1147 case tcpip.KeepaliveCountOption: 1148 e.keepalive.Lock() 1149 e.keepalive.count = int(v) 1150 e.keepalive.Unlock() 1151 e.notifyProtocolGoroutine(notifyKeepaliveChanged) 1152 return nil 1153 1154 case tcpip.BroadcastOption: 1155 e.mu.Lock() 1156 e.broadcast = v != 0 1157 e.mu.Unlock() 1158 return nil 1159 1160 case tcpip.CongestionControlOption: 1161 // Query the available cc algorithms in the stack and 1162 // validate that the specified algorithm is actually 1163 // supported in the stack. 1164 var avail tcpip.AvailableCongestionControlOption 1165 if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil { 1166 return err 1167 } 1168 availCC := strings.Split(string(avail), " ") 1169 for _, cc := range availCC { 1170 if v == tcpip.CongestionControlOption(cc) { 1171 // Acquire the work mutex as we may need to 1172 // reinitialize the congestion control state. 
1173 e.mu.Lock() 1174 state := e.state 1175 e.cc = v 1176 e.mu.Unlock() 1177 switch state { 1178 case StateEstablished: 1179 e.workMu.Lock() 1180 e.mu.Lock() 1181 if e.state == state { 1182 e.snd.cc = e.snd.initCongestionControl(e.cc) 1183 } 1184 e.mu.Unlock() 1185 e.workMu.Unlock() 1186 } 1187 return nil 1188 } 1189 } 1190 1191 // Linux returns ENOENT when an invalid congestion 1192 // control algorithm is specified. 1193 return tcpip.ErrNoSuchFile 1194 default: 1195 return nil 1196 } 1197 } 1198 1199 // readyReceiveSize returns the number of bytes ready to be received. 1200 func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) { 1201 e.mu.RLock() 1202 defer e.mu.RUnlock() 1203 1204 // The endpoint cannot be in listen state. 1205 if e.state == StateListen { 1206 return 0, tcpip.ErrInvalidEndpointState 1207 } 1208 1209 e.rcvListMu.Lock() 1210 defer e.rcvListMu.Unlock() 1211 1212 return e.rcvBufUsed, nil 1213 } 1214 1215 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. 1216 func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { 1217 switch opt { 1218 case tcpip.ReceiveQueueSizeOption: 1219 return e.readyReceiveSize() 1220 case tcpip.SendBufferSizeOption: 1221 e.sndBufMu.Lock() 1222 v := e.sndBufSize 1223 e.sndBufMu.Unlock() 1224 return v, nil 1225 1226 case tcpip.ReceiveBufferSizeOption: 1227 e.rcvListMu.Lock() 1228 v := e.rcvBufSize 1229 e.rcvListMu.Unlock() 1230 return v, nil 1231 1232 } 1233 return -1, tcpip.ErrUnknownProtocolOption 1234 } 1235 1236 // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 1237 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { 1238 switch o := opt.(type) { 1239 case tcpip.ErrorOption: 1240 e.lastErrorMu.Lock() 1241 err := e.lastError 1242 e.lastError = nil 1243 e.lastErrorMu.Unlock() 1244 return err 1245 1246 case *tcpip.MaxSegOption: 1247 // This is just stubbed out. Linux never returns the user_mss 1248 // value as it either returns the defaultMSS or returns the 1249 // actual current MSS. Netstack just returns the defaultMSS 1250 // always for now. 1251 *o = header.TCPDefaultMSS 1252 return nil 1253 1254 case *tcpip.DelayOption: 1255 *o = 0 1256 if v := atomic.LoadUint32(&e.delay); v != 0 { 1257 *o = 1 1258 } 1259 return nil 1260 1261 case *tcpip.CorkOption: 1262 *o = 0 1263 if v := atomic.LoadUint32(&e.cork); v != 0 { 1264 *o = 1 1265 } 1266 return nil 1267 1268 case *tcpip.ReuseAddressOption: 1269 e.mu.RLock() 1270 v := e.reuseAddr 1271 e.mu.RUnlock() 1272 1273 *o = 0 1274 if v { 1275 *o = 1 1276 } 1277 return nil 1278 1279 case *tcpip.ReusePortOption: 1280 e.mu.RLock() 1281 v := e.reusePort 1282 e.mu.RUnlock() 1283 1284 *o = 0 1285 if v { 1286 *o = 1 1287 } 1288 return nil 1289 1290 case *tcpip.BindToDeviceOption: 1291 e.mu.RLock() 1292 defer e.mu.RUnlock() 1293 if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok { 1294 *o = tcpip.BindToDeviceOption(nic.Name) 1295 return nil 1296 } 1297 *o = "" 1298 return nil 1299 1300 case *tcpip.QuickAckOption: 1301 *o = 1 1302 if v := atomic.LoadUint32(&e.slowAck); v != 0 { 1303 *o = 0 1304 } 1305 return nil 1306 1307 case *tcpip.V6OnlyOption: 1308 // We only recognize this option on v6 endpoints. 
1309 if e.netProto != header.IPv6ProtocolNumber { 1310 return tcpip.ErrUnknownProtocolOption 1311 } 1312 1313 e.mu.Lock() 1314 v := e.v6only 1315 e.mu.Unlock() 1316 1317 *o = 0 1318 if v { 1319 *o = 1 1320 } 1321 return nil 1322 1323 case *tcpip.TTLOption: 1324 e.mu.Lock() 1325 *o = tcpip.TTLOption(e.ttl) 1326 e.mu.Unlock() 1327 return nil 1328 1329 case *tcpip.TCPInfoOption: 1330 *o = tcpip.TCPInfoOption{} 1331 e.mu.RLock() 1332 snd := e.snd 1333 e.mu.RUnlock() 1334 if snd != nil { 1335 snd.rtt.Lock() 1336 o.RTT = snd.rtt.srtt 1337 o.RTTVar = snd.rtt.rttvar 1338 snd.rtt.Unlock() 1339 } 1340 return nil 1341 1342 case *tcpip.KeepaliveEnabledOption: 1343 e.keepalive.Lock() 1344 v := e.keepalive.enabled 1345 e.keepalive.Unlock() 1346 1347 *o = 0 1348 if v { 1349 *o = 1 1350 } 1351 return nil 1352 1353 case *tcpip.KeepaliveIdleOption: 1354 e.keepalive.Lock() 1355 *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) 1356 e.keepalive.Unlock() 1357 return nil 1358 1359 case *tcpip.KeepaliveIntervalOption: 1360 e.keepalive.Lock() 1361 *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) 1362 e.keepalive.Unlock() 1363 return nil 1364 1365 case *tcpip.KeepaliveCountOption: 1366 e.keepalive.Lock() 1367 *o = tcpip.KeepaliveCountOption(e.keepalive.count) 1368 e.keepalive.Unlock() 1369 return nil 1370 1371 case *tcpip.OutOfBandInlineOption: 1372 // We don't currently support disabling this option. 1373 *o = 1 1374 return nil 1375 1376 case *tcpip.BroadcastOption: 1377 e.mu.Lock() 1378 v := e.broadcast 1379 e.mu.Unlock() 1380 1381 *o = 0 1382 if v { 1383 *o = 1 1384 } 1385 return nil 1386 1387 case *tcpip.CongestionControlOption: 1388 e.mu.Lock() 1389 *o = e.cc 1390 e.mu.Unlock() 1391 return nil 1392 1393 default: 1394 return tcpip.ErrUnknownProtocolOption 1395 } 1396 } 1397 1398 func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) { 1399 netProto := e.netProto 1400 if header.IsV4MappedAddress(addr.Addr) { 1401 // Fail if using a v4 mapped address on a v6only endpoint. 1402 if e.v6only { 1403 return 0, tcpip.ErrNoRoute 1404 } 1405 1406 netProto = header.IPv4ProtocolNumber 1407 addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:] 1408 if addr.Addr == header.IPv4Any { 1409 addr.Addr = "" 1410 } 1411 } 1412 1413 // Fail if we're bound to an address length different from the one we're 1414 // checking. 1415 if l := len(e.id.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) { 1416 return 0, tcpip.ErrInvalidEndpointState 1417 } 1418 1419 return netProto, nil 1420 } 1421 1422 // Disconnect implements tcpip.Endpoint.Disconnect. 1423 func (*endpoint) Disconnect() *tcpip.Error { 1424 return tcpip.ErrNotSupported 1425 } 1426 1427 // Connect connects the endpoint to its peer. 1428 func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { 1429 return e.connect(addr, true, true) 1430 } 1431 1432 // connect connects the endpoint to its peer. In the normal non-S/R case, the 1433 // new connection is expected to run the main goroutine and perform handshake. 1434 // In restore of previously connected endpoints, both ends will be passively 1435 // created (so no new handshaking is done); for stack-accepted connections not 1436 // yet accepted by the app, they are restored without running the main goroutine 1437 // here. 
1438 func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (err *tcpip.Error) { 1439 e.mu.Lock() 1440 defer e.mu.Unlock() 1441 defer func() { 1442 if err != nil && !err.IgnoreStats() { 1443 e.stack.Stats().TCP.FailedConnectionAttempts.Increment() 1444 } 1445 }() 1446 1447 connectingAddr := addr.Addr 1448 1449 netProto, err := e.checkV4Mapped(&addr) 1450 if err != nil { 1451 return err 1452 } 1453 1454 if e.state.connected() { 1455 // The endpoint is already connected. If caller hasn't been 1456 // notified yet, return success. 1457 if !e.isConnectNotified { 1458 e.isConnectNotified = true 1459 return nil 1460 } 1461 // Otherwise return that it's already connected. 1462 return tcpip.ErrAlreadyConnected 1463 } 1464 1465 nicid := addr.NIC 1466 switch e.state { 1467 case StateBound: 1468 // If we're already bound to a NIC but the caller is requesting 1469 // that we use a different one now, we cannot proceed. 1470 if e.boundNICID == 0 { 1471 break 1472 } 1473 1474 if nicid != 0 && nicid != e.boundNICID { 1475 return tcpip.ErrNoRoute 1476 } 1477 1478 nicid = e.boundNICID 1479 1480 case StateInitial: 1481 // Nothing to do. We'll eventually fill-in the gaps in the ID (if any) 1482 // when we find a route. 1483 1484 case StateConnecting, StateSynSent, StateSynRecv: 1485 // A connection request has already been issued but hasn't completed 1486 // yet. 1487 return tcpip.ErrAlreadyConnecting 1488 1489 case StateError: 1490 return e.hardError 1491 1492 default: 1493 return tcpip.ErrInvalidEndpointState 1494 } 1495 1496 // Find a route to the desired destination. 1497 r, err := e.stack.FindRoute(nicid, e.id.LocalAddress, addr.Addr, netProto, false /* multicastLoop */) 1498 if err != nil { 1499 return err 1500 } 1501 defer r.Release() 1502 1503 origID := e.id 1504 1505 netProtos := []tcpip.NetworkProtocolNumber{netProto} 1506 e.id.LocalAddress = r.LocalAddress 1507 e.id.RemoteAddress = r.RemoteAddress 1508 e.id.RemotePort = addr.Port 1509 1510 if e.id.LocalPort != 0 { 1511 // The endpoint is bound to a port, attempt to register it. 1512 err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, e.id, e, e.reusePort, e.bindToDevice) 1513 if err != nil { 1514 return err 1515 } 1516 } else { 1517 // The endpoint doesn't have a local port yet, so try to get 1518 // one. Make sure that it isn't one that will result in the same 1519 // address/port for both local and remote (otherwise this 1520 // endpoint would be trying to connect to itself). 1521 sameAddr := e.id.LocalAddress == e.id.RemoteAddress 1522 1523 // Calculate a port offset based on the destination IP/port and 1524 // src IP to ensure that for a given tuple (srcIP, destIP, 1525 // destPort) the offset used as a starting point is the same to 1526 // ensure that we can cycle through the port space effectively. 1527 h := jenkins.Sum32(e.stack.PortSeed()) 1528 h.Write([]byte(e.id.LocalAddress)) 1529 h.Write([]byte(e.id.RemoteAddress)) 1530 portBuf := make([]byte, 2) 1531 binary.LittleEndian.PutUint16(portBuf, e.id.RemotePort) 1532 h.Write(portBuf) 1533 portOffset := h.Sum32() 1534 1535 if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, *tcpip.Error) { 1536 if sameAddr && p == e.id.RemotePort { 1537 return false, nil 1538 } 1539 // reusePort is false below because connect cannot reuse a port even if 1540 // reusePort was set. 
1541 if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.id.LocalAddress, p, false /* reusePort */, e.bindToDevice) { 1542 return false, nil 1543 } 1544 1545 id := e.id 1546 id.LocalPort = p 1547 switch e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) { 1548 case nil: 1549 e.id = id 1550 return true, nil 1551 case tcpip.ErrPortInUse: 1552 return false, nil 1553 default: 1554 return false, err 1555 } 1556 }); err != nil { 1557 return err 1558 } 1559 } 1560 1561 // Remove the port reservation. This can happen when Bind is called 1562 // before Connect: in such a case we don't want to hold on to 1563 // reservations anymore. 1564 if e.isPortReserved { 1565 e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.bindToDevice) 1566 e.isPortReserved = false 1567 } 1568 1569 e.isRegistered = true 1570 e.state = StateConnecting 1571 e.route = r.Clone() 1572 e.boundNICID = nicid 1573 e.effectiveNetProtos = netProtos 1574 e.connectingAddress = connectingAddr 1575 1576 e.initGSO() 1577 1578 // Connect in the restore phase does not perform handshake. Restore its 1579 // connection setting here. 1580 if !handshake { 1581 e.segmentQueue.mu.Lock() 1582 for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} { 1583 for s := l.Front(); s != nil; s = s.Next() { 1584 s.id = e.id 1585 s.route = r.Clone() 1586 e.sndWaker.Assert() 1587 } 1588 } 1589 e.segmentQueue.mu.Unlock() 1590 e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0) 1591 e.state = StateEstablished 1592 } 1593 1594 if run { 1595 e.workerRunning = true 1596 e.stack.Stats().TCP.ActiveConnectionOpenings.Increment() 1597 go e.protocolMainLoop(handshake) 1598 } 1599 1600 return tcpip.ErrConnectStarted 1601 } 1602 1603 // ConnectEndpoint is not supported. 1604 func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error { 1605 return tcpip.ErrInvalidEndpointState 1606 } 1607 1608 // Shutdown closes the read and/or write end of the endpoint connection to its 1609 // peer. 1610 func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { 1611 e.mu.Lock() 1612 defer e.mu.Unlock() 1613 e.shutdownFlags |= flags 1614 1615 switch { 1616 case e.state.connected(): 1617 // Close for read. 1618 if (e.shutdownFlags & tcpip.ShutdownRead) != 0 { 1619 // Mark read side as closed. 1620 e.rcvListMu.Lock() 1621 e.rcvClosed = true 1622 rcvBufUsed := e.rcvBufUsed 1623 e.rcvListMu.Unlock() 1624 1625 // If we're fully closed and we have unread data we need to abort 1626 // the connection with a RST. 1627 if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 { 1628 e.notifyProtocolGoroutine(notifyReset) 1629 return nil 1630 } 1631 } 1632 1633 // Close for write. 1634 if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 { 1635 e.sndBufMu.Lock() 1636 1637 if e.sndClosed { 1638 // Already closed. 1639 e.sndBufMu.Unlock() 1640 break 1641 } 1642 1643 // Queue fin segment. 1644 s := newSegmentFromView(&e.route, e.id, nil) 1645 e.sndQueue.PushBack(s) 1646 e.sndBufInQueue++ 1647 1648 // Mark endpoint as closed. 1649 e.sndClosed = true 1650 1651 e.sndBufMu.Unlock() 1652 1653 // Tell protocol goroutine to close. 1654 e.sndCloseWaker.Assert() 1655 } 1656 1657 case e.state == StateListen: 1658 // Tell protocolListenLoop to stop. 
1659 if flags&tcpip.ShutdownRead != 0 { 1660 e.notifyProtocolGoroutine(notifyClose) 1661 } 1662 1663 default: 1664 return tcpip.ErrNotConnected 1665 } 1666 1667 return nil 1668 } 1669 1670 // Listen puts the endpoint in "listen" mode, which allows it to accept 1671 // new connections. 1672 func (e *endpoint) Listen(backlog int) (err *tcpip.Error) { 1673 e.mu.Lock() 1674 defer e.mu.Unlock() 1675 defer func() { 1676 if err != nil && !err.IgnoreStats() { 1677 e.stack.Stats().TCP.FailedConnectionAttempts.Increment() 1678 } 1679 }() 1680 1681 // Allow the backlog to be adjusted if the endpoint is not shutting down. 1682 // When the endpoint shuts down, it sets workerCleanup to true, and from 1683 // that point onward, acceptedChan is the responsibility of the cleanup() 1684 // method (and should not be touched anywhere else, including here). 1685 if e.state == StateListen && !e.workerCleanup { 1686 // Adjust the size of the channel iff we can fix existing 1687 // pending connections into the new one. 1688 if len(e.acceptedChan) > backlog { 1689 return tcpip.ErrInvalidEndpointState 1690 } 1691 if cap(e.acceptedChan) == backlog { 1692 return nil 1693 } 1694 origChan := e.acceptedChan 1695 e.acceptedChan = make(chan *endpoint, backlog) 1696 close(origChan) 1697 for ep := range origChan { 1698 e.acceptedChan <- ep 1699 } 1700 return nil 1701 } 1702 1703 // Endpoint must be bound before it can transition to listen mode. 1704 if e.state != StateBound { 1705 return tcpip.ErrInvalidEndpointState 1706 } 1707 1708 // Register the endpoint. 1709 if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.reusePort, e.bindToDevice); err != nil { 1710 return err 1711 } 1712 1713 e.isRegistered = true 1714 e.state = StateListen 1715 if e.acceptedChan == nil { 1716 e.acceptedChan = make(chan *endpoint, backlog) 1717 } 1718 e.workerRunning = true 1719 1720 go e.protocolListenLoop( 1721 seqnum.Size(e.receiveBufferAvailable())) 1722 1723 return nil 1724 } 1725 1726 // startAcceptedLoop sets up required state and starts a goroutine with the 1727 // main loop for accepted connections. 1728 func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) { 1729 e.waiterQueue = waiterQueue 1730 e.workerRunning = true 1731 go e.protocolMainLoop(false) 1732 } 1733 1734 // Accept returns a new endpoint if a peer has established a connection 1735 // to an endpoint previously set to listen mode. 1736 func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { 1737 e.mu.RLock() 1738 defer e.mu.RUnlock() 1739 1740 // Endpoint must be in listen state before it can accept connections. 1741 if e.state != StateListen { 1742 return nil, nil, tcpip.ErrInvalidEndpointState 1743 } 1744 1745 // Get the new accepted endpoint. 1746 var n *endpoint 1747 select { 1748 case n = <-e.acceptedChan: 1749 default: 1750 return nil, nil, tcpip.ErrWouldBlock 1751 } 1752 1753 // Start the protocol goroutine. 1754 wq := &waiter.Queue{} 1755 n.startAcceptedLoop(wq) 1756 e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() 1757 1758 return n, wq, nil 1759 } 1760 1761 // Bind binds the endpoint to a specific local port and optionally address. 1762 func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) { 1763 e.mu.Lock() 1764 defer e.mu.Unlock() 1765 1766 // Don't allow binding once endpoint is not in the initial state 1767 // anymore. This is because once the endpoint goes into a connected or 1768 // listen state, it is already bound. 
1769 if e.state != StateInitial { 1770 return tcpip.ErrAlreadyBound 1771 } 1772 1773 e.bindAddress = addr.Addr 1774 netProto, err := e.checkV4Mapped(&addr) 1775 if err != nil { 1776 return err 1777 } 1778 1779 // Expand netProtos to include v4 and v6 if the caller is binding to a 1780 // wildcard (empty) address, and this is an IPv6 endpoint with v6only 1781 // set to false. 1782 netProtos := []tcpip.NetworkProtocolNumber{netProto} 1783 if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" { 1784 netProtos = []tcpip.NetworkProtocolNumber{ 1785 header.IPv6ProtocolNumber, 1786 header.IPv4ProtocolNumber, 1787 } 1788 } 1789 1790 port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice) 1791 if err != nil { 1792 return err 1793 } 1794 1795 e.isPortReserved = true 1796 e.effectiveNetProtos = netProtos 1797 e.id.LocalPort = port 1798 1799 // Any failures beyond this point must remove the port registration. 1800 defer func(bindToDevice tcpip.NICID) { 1801 if err != nil { 1802 e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice) 1803 e.isPortReserved = false 1804 e.effectiveNetProtos = nil 1805 e.id.LocalPort = 0 1806 e.id.LocalAddress = "" 1807 e.boundNICID = 0 1808 } 1809 }(e.bindToDevice) 1810 1811 // If an address is specified, we must ensure that it's one of our 1812 // local addresses. 1813 if len(addr.Addr) != 0 { 1814 nic := e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) 1815 if nic == 0 { 1816 return tcpip.ErrBadLocalAddress 1817 } 1818 1819 e.boundNICID = nic 1820 e.id.LocalAddress = addr.Addr 1821 } 1822 1823 // Mark endpoint as bound. 1824 e.state = StateBound 1825 1826 return nil 1827 } 1828 1829 // GetLocalAddress returns the address to which the endpoint is bound. 1830 func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { 1831 e.mu.RLock() 1832 defer e.mu.RUnlock() 1833 1834 return tcpip.FullAddress{ 1835 Addr: e.id.LocalAddress, 1836 Port: e.id.LocalPort, 1837 NIC: e.boundNICID, 1838 }, nil 1839 } 1840 1841 // GetRemoteAddress returns the address to which the endpoint is connected. 1842 func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { 1843 e.mu.RLock() 1844 defer e.mu.RUnlock() 1845 1846 if !e.state.connected() { 1847 return tcpip.FullAddress{}, tcpip.ErrNotConnected 1848 } 1849 1850 return tcpip.FullAddress{ 1851 Addr: e.id.RemoteAddress, 1852 Port: e.id.RemotePort, 1853 NIC: e.boundNICID, 1854 }, nil 1855 } 1856 1857 // HandlePacket is called by the stack when new packets arrive to this transport 1858 // endpoint. 1859 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { 1860 s := newSegment(r, id, vv) 1861 if !s.parse() { 1862 e.stack.Stats().MalformedRcvdPackets.Increment() 1863 e.stack.Stats().TCP.InvalidSegmentsReceived.Increment() 1864 s.decRef() 1865 return 1866 } 1867 1868 if !s.csumValid { 1869 e.stack.Stats().MalformedRcvdPackets.Increment() 1870 e.stack.Stats().TCP.ChecksumErrors.Increment() 1871 s.decRef() 1872 return 1873 } 1874 1875 e.stack.Stats().TCP.ValidSegmentsReceived.Increment() 1876 if (s.flags & header.TCPFlagRst) != 0 { 1877 e.stack.Stats().TCP.ResetsReceived.Increment() 1878 } 1879 1880 // Send packet to worker goroutine. 1881 if e.segmentQueue.enqueue(s) { 1882 e.newSegmentWaker.Assert() 1883 } else { 1884 // The queue is full, so we drop the segment. 
		e.stack.Stats().DroppedPackets.Increment()
		s.decRef()
	}
}

// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
	switch typ {
	case stack.ControlPacketTooBig:
		e.sndBufMu.Lock()
		e.packetTooBigCount++
		if v := int(extra); v < e.sndMTU {
			e.sndMTU = v
		}
		e.sndBufMu.Unlock()

		e.notifyProtocolGoroutine(notifyMTUChanged)
	}
}

// updateSndBufferUsage is called by the protocol goroutine when room opens up
// in the send buffer. The number of newly available bytes is v.
func (e *endpoint) updateSndBufferUsage(v int) {
	e.sndBufMu.Lock()
	notify := e.sndBufUsed >= e.sndBufSize>>1
	e.sndBufUsed -= v
	// We only notify when there is half the sndBufSize available after
	// a full buffer event occurs. This ensures that we don't wake up
	// writers to queue just 1-2 segments and go back to sleep.
	notify = notify && e.sndBufUsed < e.sndBufSize>>1
	e.sndBufMu.Unlock()

	if notify {
		e.waiterQueue.Notify(waiter.EventOut)
	}
}

// readyToRead is called by the protocol goroutine when a new segment is ready
// to be read, or when the connection is closed for receiving (in which case
// s will be nil).
func (e *endpoint) readyToRead(s *segment) {
	e.rcvListMu.Lock()
	if s != nil {
		s.incRef()
		e.rcvBufUsed += s.data.Size()
		// Check if the receive window is now closed. If so make sure
		// we set the zero window before we deliver the segment to ensure
		// that a subsequent read of the segment will correctly trigger
		// a non-zero notification.
		if avail := e.receiveBufferAvailableLocked(); avail>>e.rcv.rcvWndScale == 0 {
			e.zeroWindow = true
		}
		e.rcvList.PushBack(s)
	} else {
		e.rcvClosed = true
	}
	e.rcvListMu.Unlock()

	e.waiterQueue.Notify(waiter.EventIn)
}

// receiveBufferAvailableLocked calculates how many bytes are still available
// in the receive buffer.
// rcvListMu must be held when this function is called.
func (e *endpoint) receiveBufferAvailableLocked() int {
	// We may use more bytes than the buffer size when the receive buffer
	// shrinks.
	if e.rcvBufUsed >= e.rcvBufSize {
		return 0
	}

	return e.rcvBufSize - e.rcvBufUsed
}

// receiveBufferAvailable calculates how many bytes are still available in the
// receive buffer.
func (e *endpoint) receiveBufferAvailable() int {
	e.rcvListMu.Lock()
	available := e.receiveBufferAvailableLocked()
	e.rcvListMu.Unlock()
	return available
}

func (e *endpoint) receiveBufferSize() int {
	e.rcvListMu.Lock()
	size := e.rcvBufSize
	e.rcvListMu.Unlock()

	return size
}

func (e *endpoint) maxReceiveBufferSize() int {
	var rs ReceiveBufferSizeOption
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
		// As a fallback return the hardcoded max buffer size.
		return MaxBufferSize
	}
	return rs.Max
}

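// The TCP header can only advertise a 16-bit window, so when the receive
// buffer may exceed 64 KiB a window scale (RFC 7323) has to be offered in
// the handshake. As a rough sketch of the idea (FindWndScale implements the
// policy actually used by this stack), the scale is the smallest shift that
// makes the buffer size representable in the window field:
//
//	scale := 0
//	for size > 0xffff && scale < 14 { // 14 is the RFC 7323 maximum shift.
//		size >>= 1
//		scale++
//	}
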
// rcvWndScaleForHandshake computes the receive window scale to offer to the
// peer when window scaling is enabled (true by default). If auto-tuning is
// disabled, the window scaling factor is based on the size of the receive
// buffer; otherwise the maximum permissible receive buffer size is used to
// compute the scale.
func (e *endpoint) rcvWndScaleForHandshake() int {
	bufSizeForScale := e.receiveBufferSize()

	e.rcvListMu.Lock()
	autoTuningDisabled := e.rcvAutoParams.disabled
	e.rcvListMu.Unlock()
	if autoTuningDisabled {
		return FindWndScale(seqnum.Size(bufSizeForScale))
	}

	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
}

// updateRecentTimestamp updates the recent timestamp using the algorithm
// described in https://tools.ietf.org/html/rfc7323#section-4.3
func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
	if e.sendTSOk && seqnum.Value(e.recentTS).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
		e.recentTS = tsVal
	}
}

// maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
// the SYN options indicate that the timestamp option was negotiated. It also
// initializes recentTS with the value provided in synOpts.TSVal.
func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
	if synOpts.TS {
		e.sendTSOk = true
		e.recentTS = synOpts.TSVal
	}
}

// timestamp returns the timestamp value to be used in the TSVal field of the
// timestamp option for outgoing TCP segments for a given endpoint.
func (e *endpoint) timestamp() uint32 {
	return tcpTimeStamp(e.tsOffset)
}

// tcpTimeStamp returns a timestamp offset by the provided offset. This is
// not inlined above as it's used when SYN cookies are in use and the endpoint
// is not yet created at the time the SYN cookie is sent.
func tcpTimeStamp(offset uint32) uint32 {
	now := time.Now()
	return uint32(now.Unix()*1000+int64(now.Nanosecond()/1e6)) + offset
}

// timeStampOffset returns a randomized timestamp offset to be used when sending
// timestamp values in a timestamp option for a TCP segment.
func timeStampOffset() uint32 {
	b := make([]byte, 4)
	if _, err := rand.Read(b); err != nil {
		panic(err)
	}
	// Initialize a random tsOffset that will be added to the recentTS
	// every time the timestamp is sent when the Timestamp option is enabled.
	//
	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
	// why this is required.
	//
	// NOTE: This is not completely to spec, as normally this should be
	// initialized in a manner analogous to how sequence numbers are
	// randomized on a per-connection basis. But for now this is sufficient.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

// maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
// if the SYN options indicate that the SACK option was negotiated and the TCP
// stack is configured to enable the TCP SACK option.
func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
	var v SACKEnabled
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
		// Stack doesn't support SACK. So just return.
		return
	}
	if bool(v) && synOpts.SACKPermitted {
		e.sackPermitted = true
	}
}

// maxOptionSize returns the maximum size of TCP options.
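// The size is measured empirically: the endpoint's options are generated
// with the maximum possible number of SACK blocks and the length of that
// encoding is returned.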
func (e *endpoint) maxOptionSize() (size int) {
	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
	options := e.makeOptions(maxSackBlocks[:])
	size = len(options)
	putOptions(options)

	return size
}

// completeState makes a full copy of the endpoint and returns it. This is used
// before invoking the probe. The state returned may not be fully consistent if
// there are intervening syscalls when the state is being copied.
func (e *endpoint) completeState() stack.TCPEndpointState {
	var s stack.TCPEndpointState
	s.SegTime = time.Now()

	// Copy EndpointID.
	e.mu.Lock()
	s.ID = stack.TCPEndpointID(e.id)
	e.mu.Unlock()

	// Copy endpoint rcv state.
	e.rcvListMu.Lock()
	s.RcvBufSize = e.rcvBufSize
	s.RcvBufUsed = e.rcvBufUsed
	s.RcvClosed = e.rcvClosed
	s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
	s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
	s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
	s.RcvAutoParams.RTT = e.rcvAutoParams.rtt
	s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
	s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
	s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
	e.rcvListMu.Unlock()

	// Endpoint TCP Option state.
	s.SendTSOk = e.sendTSOk
	s.RecentTS = e.recentTS
	s.TSOffset = e.tsOffset
	s.SACKPermitted = e.sackPermitted
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// Copy endpoint send state.
	e.sndBufMu.Lock()
	s.SndBufSize = e.sndBufSize
	s.SndBufUsed = e.sndBufUsed
	s.SndClosed = e.sndClosed
	s.SndBufInQueue = e.sndBufInQueue
	s.PacketTooBigCount = e.packetTooBigCount
	s.SndMTU = e.sndMTU
	e.sndBufMu.Unlock()

	// Copy receiver state.
	s.Receiver = stack.TCPReceiverState{
		RcvNxt:         e.rcv.rcvNxt,
		RcvAcc:         e.rcv.rcvAcc,
		RcvWndScale:    e.rcv.rcvWndScale,
		PendingBufUsed: e.rcv.pendingBufUsed,
		PendingBufSize: e.rcv.pendingBufSize,
	}

	// Copy sender state.
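	// As the function comment above notes, these sender fields are read
	// without a sender-side lock (only the RTT fields below take
	// e.snd.rtt), so the snapshot may be slightly inconsistent with the
	// live sender state.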
	s.Sender = stack.TCPSenderState{
		LastSendTime: e.snd.lastSendTime,
		DupAckCount:  e.snd.dupAckCount,
		FastRecovery: stack.TCPFastRecoveryState{
			Active:    e.snd.fr.active,
			First:     e.snd.fr.first,
			Last:      e.snd.fr.last,
			MaxCwnd:   e.snd.fr.maxCwnd,
			HighRxt:   e.snd.fr.highRxt,
			RescueRxt: e.snd.fr.rescueRxt,
		},
		SndCwnd:          e.snd.sndCwnd,
		Ssthresh:         e.snd.sndSsthresh,
		SndCAAckCount:    e.snd.sndCAAckCount,
		Outstanding:      e.snd.outstanding,
		SndWnd:           e.snd.sndWnd,
		SndUna:           e.snd.sndUna,
		SndNxt:           e.snd.sndNxt,
		RTTMeasureSeqNum: e.snd.rttMeasureSeqNum,
		RTTMeasureTime:   e.snd.rttMeasureTime,
		Closed:           e.snd.closed,
		RTO:              e.snd.rto,
		MaxPayloadSize:   e.snd.maxPayloadSize,
		SndWndScale:      e.snd.sndWndScale,
		MaxSentAck:       e.snd.maxSentAck,
	}
	e.snd.rtt.Lock()
	s.Sender.SRTT = e.snd.rtt.srtt
	s.Sender.SRTTInited = e.snd.rtt.srttInited
	e.snd.rtt.Unlock()

	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = stack.TCPCubicState{
			WMax:                    cubic.wMax,
			WLastMax:                cubic.wLastMax,
			T:                       cubic.t,
			TimeSinceLastCongestion: time.Since(cubic.t),
			C:                       cubic.c,
			K:                       cubic.k,
			Beta:                    cubic.beta,
			WC:                      cubic.wC,
			WEst:                    cubic.wEst,
		}
	}
	return s
}

func (e *endpoint) initGSO() {
	if e.route.Capabilities()&stack.CapabilityGSO == 0 {
		return
	}

	gso := &stack.GSO{}
	switch e.route.NetProto {
	case header.IPv4ProtocolNumber:
		gso.Type = stack.GSOTCPv4
		gso.L3HdrLen = header.IPv4MinimumSize
	case header.IPv6ProtocolNumber:
		gso.Type = stack.GSOTCPv6
		gso.L3HdrLen = header.IPv6MinimumSize
	default:
		panic(fmt.Sprintf("Unknown netProto: %v", e.netProto))
	}
	gso.NeedsCsum = true
	gso.CsumOffset = header.TCPChecksumOffset
	gso.MaxSize = e.route.GSOMaxSize()
	e.gso = gso
}

// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
	e.mu.Lock()
	defer e.mu.Unlock()
	return uint32(e.state)
}

func mssForRoute(r *stack.Route) uint16 {
	return uint16(r.MTU() - header.TCPMinimumSize)
}
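
// As a worked example (assuming the network layer has already deducted its
// own header from Route.MTU, as the IPv4 and IPv6 endpoints in this stack
// do): over a 1500-byte Ethernet MTU an IPv4 route reports an MTU of 1480,
// so mssForRoute advertises the familiar 1460-byte MSS. TCP option bytes are
// not subtracted here; the sender accounts for them separately when sizing
// individual segments.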