gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/transport/tcp/connect.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "crypto/sha256" 19 "encoding/binary" 20 "fmt" 21 "math" 22 "time" 23 24 "gvisor.dev/gvisor/pkg/sync" 25 "gvisor.dev/gvisor/pkg/tcpip" 26 "gvisor.dev/gvisor/pkg/tcpip/checksum" 27 "gvisor.dev/gvisor/pkg/tcpip/header" 28 "gvisor.dev/gvisor/pkg/tcpip/seqnum" 29 "gvisor.dev/gvisor/pkg/tcpip/stack" 30 "gvisor.dev/gvisor/pkg/waiter" 31 ) 32 33 // InitialRTO is the initial retransmission timeout. 34 // https://github.com/torvalds/linux/blob/7c636d4d20f/include/net/tcp.h#L142 35 const InitialRTO = time.Second 36 37 // maxSegmentsPerWake is the maximum number of segments to process in the main 38 // protocol goroutine per wake-up. Yielding [after this number of segments are 39 // processed] allows other events to be processed as well (e.g., timeouts, 40 // resets, etc.). 41 const maxSegmentsPerWake = 100 42 43 type handshakeState int 44 45 // The following are the possible states of the TCP connection during a 3-way 46 // handshake. A depiction of the states and transitions can be found in RFC 793, 47 // page 23. 48 const ( 49 handshakeSynSent handshakeState = iota 50 handshakeSynRcvd 51 handshakeCompleted 52 ) 53 54 const ( 55 // Maximum space available for options. 56 maxOptionSize = 40 57 ) 58 59 // handshake holds the state used during a TCP 3-way handshake. 60 // 61 // NOTE: handshake.ep.mu is held during handshake processing. It is released if 62 // we are going to block and reacquired when we start processing an event. 63 // 64 // +stateify savable 65 type handshake struct { 66 ep *Endpoint 67 listenEP *Endpoint 68 state handshakeState 69 active bool 70 flags header.TCPFlags 71 ackNum seqnum.Value 72 73 // iss is the initial send sequence number, as defined in RFC 793. 74 iss seqnum.Value 75 76 // rcvWnd is the receive window, as defined in RFC 793. 77 rcvWnd seqnum.Size 78 79 // sndWnd is the send window, as defined in RFC 793. 80 sndWnd seqnum.Size 81 82 // mss is the maximum segment size received from the peer. 83 mss uint16 84 85 // sndWndScale is the send window scale, as defined in RFC 1323. A 86 // negative value means no scaling is supported by the peer. 87 sndWndScale int 88 89 // rcvWndScale is the receive window scale, as defined in RFC 1323. 90 rcvWndScale int 91 92 // startTime is the time at which the first SYN/SYN-ACK was sent. 93 startTime tcpip.MonotonicTime 94 95 // deferAccept if non-zero will drop the final ACK for a passive 96 // handshake till an ACK segment with data is received or the timeout is 97 // hit. 98 deferAccept time.Duration 99 100 // acked is true if the final ACK for a 3-way handshake has 101 // been received. This is required to stop retransmitting the 102 // original SYN-ACK when deferAccept is enabled. 103 acked bool 104 105 // sendSYNOpts is the cached values for the SYN options to be sent. 106 sendSYNOpts header.TCPSynOptions 107 108 // sampleRTTWithTSOnly is true when the segment was retransmitted or we can't 109 // tell; then RTT can only be sampled when the incoming segment has timestamp 110 // options enabled. 111 sampleRTTWithTSOnly bool 112 113 // retransmitTimer is used to retransmit SYN/SYN-ACK with exponential backoff 114 // till handshake is either completed or timesout. 115 retransmitTimer *backoffTimer `state:"nosave"` 116 } 117 118 // timerHandler takes a handler function for a timer and returns a function that 119 // will invoke the provided handler with the endpoint mutex held. In addition 120 // the returned function will perform any cleanup that may be required if the 121 // timer handler returns an error. In the case of no errors it will notify the 122 // processor if there are pending segments that need to be processed. 123 // 124 // NOTE: e.mu is held for the duration of the call to f(). 125 func timerHandler(e *Endpoint, f func() tcpip.Error) func() { 126 return func() { 127 e.mu.Lock() 128 if err := f(); err != nil { 129 e.lastErrorMu.Lock() 130 // If the handler timed out and we have a lastError recorded (maybe due 131 // to an ICMP message received), promote it to be the hard error. 132 if _, isTimeout := err.(*tcpip.ErrTimeout); e.lastError != nil && isTimeout { 133 e.hardError = e.lastError 134 } else { 135 e.hardError = err 136 } 137 e.lastError = err 138 e.lastErrorMu.Unlock() 139 e.cleanupLocked() 140 e.setEndpointState(StateError) 141 e.mu.Unlock() 142 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 143 return 144 } 145 processor := e.protocol.dispatcher.selectProcessor(e.ID) 146 e.mu.Unlock() 147 148 // notify processor if there are pending segments to be 149 // processed. 150 if !e.segmentQueue.empty() { 151 processor.queueEndpoint(e) 152 } 153 } 154 } 155 156 // +checklocks:e.mu 157 // +checklocksacquire:h.ep.mu 158 func (e *Endpoint) newHandshake() (h *handshake) { 159 h = &handshake{ 160 ep: e, 161 active: true, 162 rcvWnd: seqnum.Size(e.initialReceiveWindow()), 163 rcvWndScale: e.rcvWndScaleForHandshake(), 164 } 165 h.ep.AssertLockHeld(e) 166 h.resetState() 167 // Store reference to handshake state in endpoint. 168 e.h = h 169 // By the time handshake is created, e.ID is already initialized. 170 e.TSOffset = e.protocol.tsOffset(e.ID.LocalAddress, e.ID.RemoteAddress) 171 timer, err := newBackoffTimer(h.ep.stack.Clock(), InitialRTO, MaxRTO, timerHandler(e, h.retransmitHandlerLocked)) 172 if err != nil { 173 panic(fmt.Sprintf("newBackOffTimer(_, %s, %s, _) failed: %s", InitialRTO, MaxRTO, err)) 174 } 175 h.retransmitTimer = timer 176 return h 177 } 178 179 // +checklocks:e.mu 180 // +checklocksacquire:h.ep.mu 181 func (e *Endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) (h *handshake) { 182 h = e.newHandshake() 183 h.resetToSynRcvd(isn, irs, opts, deferAccept) 184 return h 185 } 186 187 // FindWndScale determines the window scale to use for the given maximum window 188 // size. 189 func FindWndScale(wnd seqnum.Size) int { 190 if wnd < 0x10000 { 191 return 0 192 } 193 194 max := seqnum.Size(math.MaxUint16) 195 s := 0 196 for wnd > max && s < header.MaxWndScale { 197 s++ 198 max <<= 1 199 } 200 201 return s 202 } 203 204 // resetState resets the state of the handshake object such that it becomes 205 // ready for a new 3-way handshake. 206 func (h *handshake) resetState() { 207 h.state = handshakeSynSent 208 h.flags = header.TCPFlagSyn 209 h.ackNum = 0 210 h.mss = 0 211 h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.protocol.seqnumSecret) 212 } 213 214 // generateSecureISN generates a secure Initial Sequence number based on the 215 // recommendation here https://tools.ietf.org/html/rfc6528#page-3. 216 func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed [16]byte) seqnum.Value { 217 isnHasher := sha256.New() 218 219 // Per hash.Hash.Writer: 220 // 221 // It never returns an error. 222 _, _ = isnHasher.Write(seed[:]) 223 _, _ = isnHasher.Write(id.LocalAddress.AsSlice()) 224 _, _ = isnHasher.Write(id.RemoteAddress.AsSlice()) 225 portBuf := make([]byte, 2) 226 binary.LittleEndian.PutUint16(portBuf, id.LocalPort) 227 _, _ = isnHasher.Write(portBuf) 228 binary.LittleEndian.PutUint16(portBuf, id.RemotePort) 229 _, _ = isnHasher.Write(portBuf) 230 // The time period here is 64ns. This is similar to what linux uses 231 // generate a sequence number that overlaps less than one 232 // time per MSL (2 minutes). 233 // 234 // A 64ns clock ticks 10^9/64 = 15625000) times in a second. 235 // To wrap the whole 32 bit space would require 236 // 2^32/1562500 ~ 274 seconds. 237 // 238 // Which sort of guarantees that we won't reuse the ISN for a new 239 // connection for the same tuple for at least 274s. 240 hash := binary.LittleEndian.Uint32(isnHasher.Sum(nil)[:4]) 241 isn := hash + uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Nanoseconds()>>6) 242 return seqnum.Value(isn) 243 } 244 245 // effectiveRcvWndScale returns the effective receive window scale to be used. 246 // If the peer doesn't support window scaling, the effective rcv wnd scale is 247 // zero; otherwise it's the value calculated based on the initial rcv wnd. 248 func (h *handshake) effectiveRcvWndScale() uint8 { 249 if h.sndWndScale < 0 { 250 return 0 251 } 252 return uint8(h.rcvWndScale) 253 } 254 255 // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD 256 // state. 257 // +checklocks:h.ep.mu 258 func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) { 259 h.active = false 260 h.state = handshakeSynRcvd 261 h.flags = header.TCPFlagSyn | header.TCPFlagAck 262 h.iss = iss 263 h.ackNum = irs + 1 264 h.mss = opts.MSS 265 h.sndWndScale = opts.WS 266 h.deferAccept = deferAccept 267 h.ep.setEndpointState(StateSynRecv) 268 } 269 270 // checkAck checks if the ACK number, if present, of a segment received during 271 // a TCP 3-way handshake is valid. 272 func (h *handshake) checkAck(s *segment) bool { 273 return !(s.flags.Contains(header.TCPFlagAck) && s.ackNumber != h.iss+1) 274 } 275 276 // synSentState handles a segment received when the TCP 3-way handshake is in 277 // the SYN-SENT state. 278 // +checklocks:h.ep.mu 279 func (h *handshake) synSentState(s *segment) tcpip.Error { 280 // RFC 793, page 37, states that in the SYN-SENT state, a reset is 281 // acceptable if the ack field acknowledges the SYN. 282 if s.flags.Contains(header.TCPFlagRst) { 283 if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == h.iss+1 { 284 // RFC 793, page 67, states that "If the RST bit is set [and] If the ACK 285 // was acceptable then signal the user "error: connection reset", drop 286 // the segment, enter CLOSED state, delete TCB, and return." 287 // Although the RFC above calls out ECONNRESET, Linux actually returns 288 // ECONNREFUSED here so we do as well. 289 return &tcpip.ErrConnectionRefused{} 290 } 291 return nil 292 } 293 294 if !h.checkAck(s) { 295 // RFC 793, page 72 (https://datatracker.ietf.org/doc/html/rfc793#page-72): 296 // If the segment acknowledgment is not acceptable, form a reset segment, 297 // <SEQ=SEG.ACK><CTL=RST> 298 // and send it. 299 h.ep.sendEmptyRaw(header.TCPFlagRst, s.ackNumber, 0, 0) 300 return nil 301 } 302 303 // We are in the SYN-SENT state. We only care about segments that have 304 // the SYN flag. 305 if !s.flags.Contains(header.TCPFlagSyn) { 306 return nil 307 } 308 309 // Parse the SYN options. 310 rcvSynOpts := parseSynSegmentOptions(s) 311 312 // Remember if the Timestamp option was negotiated. 313 h.ep.maybeEnableTimestamp(rcvSynOpts) 314 315 // Remember if the SACKPermitted option was negotiated. 316 h.ep.maybeEnableSACKPermitted(rcvSynOpts) 317 318 // Remember the sequence we'll ack from now on. 319 h.ackNum = s.sequenceNumber + 1 320 h.flags |= header.TCPFlagAck 321 h.mss = rcvSynOpts.MSS 322 h.sndWndScale = rcvSynOpts.WS 323 324 // If this is a SYN ACK response, we only need to acknowledge the SYN 325 // and the handshake is completed. 326 if s.flags.Contains(header.TCPFlagAck) { 327 h.state = handshakeCompleted 328 h.transitionToStateEstablishedLocked(s) 329 330 h.ep.sendEmptyRaw(header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale()) 331 return nil 332 } 333 334 // A SYN segment was received, but no ACK in it. We acknowledge the SYN 335 // but resend our own SYN and wait for it to be acknowledged in the 336 // SYN-RCVD state. 337 h.state = handshakeSynRcvd 338 ttl := calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit) 339 amss := h.ep.amss 340 h.ep.setEndpointState(StateSynRecv) 341 synOpts := header.TCPSynOptions{ 342 WS: int(h.effectiveRcvWndScale()), 343 TS: rcvSynOpts.TS, 344 TSVal: h.ep.tsValNow(), 345 TSEcr: h.ep.recentTimestamp(), 346 347 // We only send SACKPermitted if the other side indicated it 348 // permits SACK. This is not explicitly defined in the RFC but 349 // this is the behaviour implemented by Linux. 350 SACKPermitted: rcvSynOpts.SACKPermitted, 351 MSS: amss, 352 } 353 if ttl == 0 { 354 ttl = h.ep.route.DefaultTTL() 355 } 356 h.ep.sendSynTCP(h.ep.route, tcpFields{ 357 id: h.ep.TransportEndpointInfo.ID, 358 ttl: ttl, 359 tos: h.ep.sendTOS, 360 flags: h.flags, 361 seq: h.iss, 362 ack: h.ackNum, 363 rcvWnd: h.rcvWnd, 364 }, synOpts) 365 return nil 366 } 367 368 // synRcvdState handles a segment received when the TCP 3-way handshake is in 369 // the SYN-RCVD state. 370 // +checklocks:h.ep.mu 371 func (h *handshake) synRcvdState(s *segment) tcpip.Error { 372 if s.flags.Contains(header.TCPFlagRst) { 373 // RFC 793, page 37, states that in the SYN-RCVD state, a reset 374 // is acceptable if the sequence number is in the window. 375 if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { 376 return &tcpip.ErrConnectionRefused{} 377 } 378 return nil 379 } 380 381 // It's possible that s is an ACK of a SYN cookie. This can happen if: 382 // 383 // - We receive a SYN while under load and issue a SYN/ACK with 384 // cookie S. 385 // - We receive a retransmitted SYN while space exists in the SYN 386 // queue, and issue a SYN/ACK with seqnum S'. 387 // - We receive the ACK based on S. 388 // 389 // If we receive a SYN cookie ACK, just use the cookie seqnum. 390 if !h.checkAck(s) && h.listenEP != nil { 391 iss := s.ackNumber - 1 392 data, ok := h.listenEP.listenCtx.isCookieValid(s.id, iss, s.sequenceNumber-1) 393 if !ok || int(data) >= len(mssTable) { 394 // This isn't a valid cookie. 395 // RFC 793, page 72 (https://datatracker.ietf.org/doc/html/rfc793#page-72): 396 // If the segment acknowledgment is not acceptable, form a reset segment, 397 // <SEQ=SEG.ACK><CTL=RST> 398 // and send it. 399 h.ep.sendEmptyRaw(header.TCPFlagRst, s.ackNumber, 0, 0) 400 return nil 401 } 402 // This is a cookie that snuck its way in after we stopped using them. 403 h.mss = mssTable[data] 404 h.iss = iss 405 } 406 407 // RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a 408 // sequence number outside of the window causes an ACK with the proper seq 409 // number and "After sending the acknowledgment, drop the unacceptable 410 // segment and return." 411 if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { 412 if h.ep.allowOutOfWindowAck() { 413 h.ep.sendEmptyRaw(header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd) 414 } 415 return nil 416 } 417 418 if s.flags.Contains(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 { 419 // We received two SYN segments with different sequence 420 // numbers, so we reset this and restart the whole 421 // process, except that we don't reset the timer. 422 ack := s.sequenceNumber.Add(s.logicalLen()) 423 seq := seqnum.Value(0) 424 if s.flags.Contains(header.TCPFlagAck) { 425 seq = s.ackNumber 426 } 427 h.ep.sendEmptyRaw(header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0) 428 429 if !h.active { 430 return &tcpip.ErrInvalidEndpointState{} 431 } 432 433 h.resetState() 434 synOpts := header.TCPSynOptions{ 435 WS: h.rcvWndScale, 436 TS: h.ep.SendTSOk, 437 TSVal: h.ep.tsValNow(), 438 TSEcr: h.ep.recentTimestamp(), 439 SACKPermitted: h.ep.SACKPermitted, 440 MSS: h.ep.amss, 441 } 442 h.ep.sendSynTCP(h.ep.route, tcpFields{ 443 id: h.ep.TransportEndpointInfo.ID, 444 ttl: calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit), 445 tos: h.ep.sendTOS, 446 flags: h.flags, 447 seq: h.iss, 448 ack: h.ackNum, 449 rcvWnd: h.rcvWnd, 450 }, synOpts) 451 return nil 452 } 453 454 // We have previously received (and acknowledged) the peer's SYN. If the 455 // peer acknowledges our SYN, the handshake is completed. 456 if s.flags.Contains(header.TCPFlagAck) { 457 // If deferAccept is not zero and this is a bare ACK and the 458 // timeout is not hit then drop the ACK. 459 if h.deferAccept != 0 && s.payloadSize() == 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) < h.deferAccept { 460 h.acked = true 461 h.ep.stack.Stats().DroppedPackets.Increment() 462 return nil 463 } 464 465 // If the timestamp option is negotiated and the segment does 466 // not carry a timestamp option then the segment must be dropped 467 // as per https://tools.ietf.org/html/rfc7323#section-3.2. 468 if h.ep.SendTSOk && !s.parsedOptions.TS { 469 h.ep.stack.Stats().DroppedPackets.Increment() 470 return nil 471 } 472 473 // Drop the ACK if the accept queue is full. 474 // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523 475 // We could abort the connection as well with a tunable as in 476 // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788 477 if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() { 478 listenEP.stack.Stats().DroppedPackets.Increment() 479 return nil 480 } 481 482 // Update timestamp if required. See RFC7323, section-4.3. 483 if h.ep.SendTSOk && s.parsedOptions.TS { 484 h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber) 485 } 486 487 h.state = handshakeCompleted 488 h.transitionToStateEstablishedLocked(s) 489 490 // Requeue the segment if the ACK completing the handshake has more info 491 // to be processed by the newly established endpoint. 492 if (s.flags.Contains(header.TCPFlagFin) || s.payloadSize() > 0) && h.ep.enqueueSegment(s) { 493 h.ep.protocol.dispatcher.selectProcessor(h.ep.ID).queueEndpoint(h.ep) 494 495 } 496 return nil 497 } 498 499 return nil 500 } 501 502 // +checklocks:h.ep.mu 503 func (h *handshake) handleSegment(s *segment) tcpip.Error { 504 h.sndWnd = s.window 505 if !s.flags.Contains(header.TCPFlagSyn) && h.sndWndScale > 0 { 506 h.sndWnd <<= uint8(h.sndWndScale) 507 } 508 509 switch h.state { 510 case handshakeSynRcvd: 511 return h.synRcvdState(s) 512 case handshakeSynSent: 513 return h.synSentState(s) 514 } 515 return nil 516 } 517 518 // processSegments goes through the segment queue and processes up to 519 // maxSegmentsPerWake (if they're available). 520 // +checklocks:h.ep.mu 521 func (h *handshake) processSegments() tcpip.Error { 522 for i := 0; i < maxSegmentsPerWake; i++ { 523 s := h.ep.segmentQueue.dequeue() 524 if s == nil { 525 return nil 526 } 527 528 err := h.handleSegment(s) 529 s.DecRef() 530 if err != nil { 531 return err 532 } 533 534 // We stop processing packets once the handshake is completed, 535 // otherwise we may process packets meant to be processed by 536 // the main protocol goroutine. 537 if h.state == handshakeCompleted { 538 break 539 } 540 } 541 542 return nil 543 } 544 545 // start sends the first SYN/SYN-ACK. It does not block, even if link address 546 // resolution is required. 547 func (h *handshake) start() { 548 h.startTime = h.ep.stack.Clock().NowMonotonic() 549 h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route) 550 var sackEnabled tcpip.TCPSACKEnabled 551 if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil { 552 // If stack returned an error when checking for SACKEnabled 553 // status then just default to switching off SACK negotiation. 554 sackEnabled = false 555 } 556 557 synOpts := header.TCPSynOptions{ 558 WS: h.rcvWndScale, 559 TS: true, 560 TSVal: h.ep.tsValNow(), 561 TSEcr: h.ep.recentTimestamp(), 562 SACKPermitted: bool(sackEnabled), 563 MSS: h.ep.amss, 564 } 565 566 // start() is also called in a listen context so we want to make sure we only 567 // send the TS/SACK option when we received the TS/SACK in the initial SYN. 568 if h.state == handshakeSynRcvd { 569 synOpts.TS = h.ep.SendTSOk 570 synOpts.SACKPermitted = h.ep.SACKPermitted && bool(sackEnabled) 571 if h.sndWndScale < 0 { 572 // Disable window scaling if the peer did not send us 573 // the window scaling option. 574 synOpts.WS = -1 575 } 576 } 577 578 h.sendSYNOpts = synOpts 579 h.ep.sendSynTCP(h.ep.route, tcpFields{ 580 id: h.ep.TransportEndpointInfo.ID, 581 ttl: calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit), 582 tos: h.ep.sendTOS, 583 flags: h.flags, 584 seq: h.iss, 585 ack: h.ackNum, 586 rcvWnd: h.rcvWnd, 587 }, synOpts) 588 } 589 590 // retransmitHandler handles retransmissions of un-acked SYNs. 591 // +checklocks:h.ep.mu 592 func (h *handshake) retransmitHandlerLocked() tcpip.Error { 593 e := h.ep 594 // If the endpoint has already transition out of a connecting state due 595 // to say an error (e.g) peer send RST or an ICMP error. Then just 596 // return. Any required cleanup should have been done when the RST/error 597 // was handled. 598 if !e.EndpointState().connecting() { 599 return nil 600 } 601 602 if err := h.retransmitTimer.reset(); err != nil { 603 return err 604 } 605 606 // Resend the SYN/SYN-ACK only if the following conditions hold. 607 // - It's an active handshake (deferAccept does not apply) 608 // - It's a passive handshake and we have not yet got the final-ACK. 609 // - It's a passive handshake and we got an ACK but deferAccept is 610 // enabled and we are now past the deferAccept duration. 611 // The last is required to provide a way for the peer to complete 612 // the connection with another ACK or data (as ACKs are never 613 // retransmitted on their own). 614 if h.active || !h.acked || h.deferAccept != 0 && e.stack.Clock().NowMonotonic().Sub(h.startTime) > h.deferAccept { 615 e.sendSynTCP(e.route, tcpFields{ 616 id: e.TransportEndpointInfo.ID, 617 ttl: calculateTTL(e.route, e.ipv4TTL, e.ipv6HopLimit), 618 tos: e.sendTOS, 619 flags: h.flags, 620 seq: h.iss, 621 ack: h.ackNum, 622 rcvWnd: h.rcvWnd, 623 }, h.sendSYNOpts) 624 // If we have ever retransmitted the SYN-ACK or 625 // SYN segment, we should only measure RTT if 626 // TS option is present. 627 h.sampleRTTWithTSOnly = true 628 } 629 return nil 630 } 631 632 // transitionToStateEstablisedLocked transitions the endpoint of the handshake 633 // to an established state given the last segment received from peer. It also 634 // initializes sender/receiver. 635 // +checklocks:h.ep.mu 636 func (h *handshake) transitionToStateEstablishedLocked(s *segment) { 637 // Stop the SYN retransmissions now that handshake is complete. 638 if h.retransmitTimer != nil { 639 h.retransmitTimer.stop() 640 } 641 642 // Transfer handshake state to TCP connection. We disable 643 // receive window scaling if the peer doesn't support it 644 // (indicated by a negative send window scale). 645 h.ep.snd = newSender(h.ep, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) 646 647 now := h.ep.stack.Clock().NowMonotonic() 648 649 var rtt time.Duration 650 if h.ep.SendTSOk && s.parsedOptions.TSEcr != 0 { 651 rtt = h.ep.elapsed(now, s.parsedOptions.TSEcr) 652 } 653 if !h.sampleRTTWithTSOnly && rtt == 0 { 654 rtt = now.Sub(h.startTime) 655 } 656 657 if rtt > 0 { 658 h.ep.snd.updateRTO(rtt) 659 } 660 661 h.ep.rcvQueueMu.Lock() 662 h.ep.rcv = newReceiver(h.ep, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale()) 663 // Bootstrap the auto tuning algorithm. Starting at zero will 664 // result in a really large receive window after the first auto 665 // tuning adjustment. 666 h.ep.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd) 667 h.ep.rcvQueueMu.Unlock() 668 669 h.ep.setEndpointState(StateEstablished) 670 671 // Completing the 3-way handshake is an indication that the route is valid 672 // and the remote is reachable as the only way we can complete a handshake 673 // is if our SYN reached the remote and their ACK reached us. 674 h.ep.route.ConfirmReachable() 675 676 // Tell waiters that the endpoint is connected and writable. 677 h.ep.waiterQueue.Notify(waiter.WritableEvents) 678 } 679 680 type backoffTimer struct { 681 timeout time.Duration 682 maxTimeout time.Duration 683 t tcpip.Timer 684 } 685 686 func newBackoffTimer(clock tcpip.Clock, timeout, maxTimeout time.Duration, f func()) (*backoffTimer, tcpip.Error) { 687 if timeout > maxTimeout { 688 return nil, &tcpip.ErrTimeout{} 689 } 690 bt := &backoffTimer{timeout: timeout, maxTimeout: maxTimeout} 691 bt.t = clock.AfterFunc(timeout, f) 692 return bt, nil 693 } 694 695 func (bt *backoffTimer) reset() tcpip.Error { 696 bt.timeout *= 2 697 if bt.timeout > bt.maxTimeout { 698 return &tcpip.ErrTimeout{} 699 } 700 bt.t.Reset(bt.timeout) 701 return nil 702 } 703 704 func (bt *backoffTimer) stop() { 705 bt.t.Stop() 706 } 707 708 func parseSynSegmentOptions(s *segment) header.TCPSynOptions { 709 synOpts := header.ParseSynOptions(s.options, s.flags.Contains(header.TCPFlagAck)) 710 if synOpts.TS { 711 s.parsedOptions.TSVal = synOpts.TSVal 712 s.parsedOptions.TSEcr = synOpts.TSEcr 713 } 714 return synOpts 715 } 716 717 var optionPool = sync.Pool{ 718 New: func() any { 719 return &[maxOptionSize]byte{} 720 }, 721 } 722 723 func getOptions() []byte { 724 return (*optionPool.Get().(*[maxOptionSize]byte))[:] 725 } 726 727 func putOptions(options []byte) { 728 // Reslice to full capacity. 729 optionPool.Put(optionsToArray(options)) 730 } 731 732 func makeSynOptions(opts header.TCPSynOptions) []byte { 733 // Emulate linux option order. This is as follows: 734 // 735 // if md5: NOP NOP MD5SIG 18 md5sig(16) 736 // if mss: MSS 4 mss(2) 737 // if ts and sack_advertise: 738 // SACK 2 TIMESTAMP 2 timestamp(8) 739 // elif ts: NOP NOP TIMESTAMP 10 timestamp(8) 740 // elif sack: NOP NOP SACK 2 741 // if wscale: NOP WINDOW 3 ws(1) 742 // if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8)) 743 // [for each block] start_seq(4) end_seq(4) 744 // if fastopen_cookie: 745 // if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2) 746 // else: FASTOPEN (2 + len(cookie)) 747 // cookie(variable) [padding to four bytes] 748 // 749 options := getOptions() 750 751 // Always encode the mss. 752 offset := header.EncodeMSSOption(uint32(opts.MSS), options) 753 754 // Special ordering is required here. If both TS and SACK are enabled, 755 // then the SACK option precedes TS, with no padding. If they are 756 // enabled individually, then we see padding before the option. 757 if opts.TS && opts.SACKPermitted { 758 offset += header.EncodeSACKPermittedOption(options[offset:]) 759 offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) 760 } else if opts.TS { 761 offset += header.EncodeNOP(options[offset:]) 762 offset += header.EncodeNOP(options[offset:]) 763 offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) 764 } else if opts.SACKPermitted { 765 offset += header.EncodeNOP(options[offset:]) 766 offset += header.EncodeNOP(options[offset:]) 767 offset += header.EncodeSACKPermittedOption(options[offset:]) 768 } 769 770 // Initialize the WS option. 771 if opts.WS >= 0 { 772 offset += header.EncodeNOP(options[offset:]) 773 offset += header.EncodeWSOption(opts.WS, options[offset:]) 774 } 775 776 // Padding to the end; note that this never apply unless we add a 777 // fastopen option, we always expect the offset to remain the same. 778 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 779 panic("unexpected option encoding") 780 } 781 782 return options[:offset] 783 } 784 785 // tcpFields is a struct to carry different parameters required by the 786 // send*TCP variant functions below. 787 type tcpFields struct { 788 id stack.TransportEndpointID 789 ttl uint8 790 tos uint8 791 flags header.TCPFlags 792 seq seqnum.Value 793 ack seqnum.Value 794 rcvWnd seqnum.Size 795 opts []byte 796 txHash uint32 797 df bool 798 } 799 800 func (e *Endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) tcpip.Error { 801 tf.opts = makeSynOptions(opts) 802 // We ignore SYN send errors and let the callers re-attempt send. 803 p := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + len(tf.opts)}) 804 defer p.DecRef() 805 if err := e.sendTCP(r, tf, p, stack.GSO{}); err != nil { 806 e.stats.SendErrors.SynSendToNetworkFailed.Increment() 807 } 808 putOptions(tf.opts) 809 return nil 810 } 811 812 // This method takes ownership of pkt. 813 func (e *Endpoint) sendTCP(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) tcpip.Error { 814 tf.txHash = e.txHash 815 if err := sendTCP(r, tf, pkt, gso, e.owner); err != nil { 816 e.stats.SendErrors.SegmentSendToNetworkFailed.Increment() 817 return err 818 } 819 e.stats.SegmentsSent.Increment() 820 return nil 821 } 822 823 func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) { 824 optLen := len(tf.opts) 825 tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen)) 826 pkt.TransportProtocolNumber = header.TCPProtocolNumber 827 tcp.Encode(&header.TCPFields{ 828 SrcPort: tf.id.LocalPort, 829 DstPort: tf.id.RemotePort, 830 SeqNum: uint32(tf.seq), 831 AckNum: uint32(tf.ack), 832 DataOffset: uint8(header.TCPMinimumSize + optLen), 833 Flags: tf.flags, 834 WindowSize: uint16(tf.rcvWnd), 835 }) 836 copy(tcp[header.TCPMinimumSize:], tf.opts) 837 838 xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size())) 839 // Only calculate the checksum if offloading isn't supported. 840 if gso.Type != stack.GSONone && gso.NeedsCsum { 841 // This is called CHECKSUM_PARTIAL in the Linux kernel. We 842 // calculate a checksum of the pseudo-header and save it in the 843 // TCP header, then the kernel calculate a checksum of the 844 // header and data and get the right sum of the TCP packet. 845 tcp.SetChecksum(xsum) 846 } else if r.RequiresTXTransportChecksum() { 847 xsum = checksum.Combine(xsum, pkt.Data().Checksum()) 848 tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) 849 } 850 } 851 852 func sendTCPBatch(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { 853 optLen := len(tf.opts) 854 if tf.rcvWnd > math.MaxUint16 { 855 tf.rcvWnd = math.MaxUint16 856 } 857 858 mss := int(gso.MSS) 859 n := (pkt.Data().Size() + mss - 1) / mss 860 861 size := pkt.Data().Size() 862 hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen 863 for i := 0; i < n; i++ { 864 packetSize := mss 865 if packetSize > size { 866 packetSize = size 867 } 868 size -= packetSize 869 870 pkt := pkt 871 // No need to split the packet in the final iteration. The original 872 // packet already has the truncated data. 873 shouldSplitPacket := i != n-1 874 if shouldSplitPacket { 875 splitPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: hdrSize}) 876 splitPkt.Data().ReadFromPacketData(pkt.Data(), packetSize) 877 pkt = splitPkt 878 } 879 pkt.Hash = tf.txHash 880 pkt.Owner = owner 881 882 buildTCPHdr(r, tf, pkt, gso) 883 tf.seq = tf.seq.Add(seqnum.Size(packetSize)) 884 pkt.GSOOptions = gso 885 if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos, DF: tf.df}, pkt); err != nil { 886 r.Stats().TCP.SegmentSendErrors.Increment() 887 if shouldSplitPacket { 888 pkt.DecRef() 889 } 890 return err 891 } 892 r.Stats().TCP.SegmentsSent.Increment() 893 if shouldSplitPacket { 894 pkt.DecRef() 895 } 896 } 897 return nil 898 } 899 900 // sendTCP sends a TCP segment with the provided options via the provided 901 // network endpoint and under the provided identity. This method takes 902 // ownership of pkt. 903 func sendTCP(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { 904 if tf.rcvWnd > math.MaxUint16 { 905 tf.rcvWnd = math.MaxUint16 906 } 907 908 if r.Loop()&stack.PacketLoop == 0 && gso.Type == stack.GSOGvisor && int(gso.MSS) < pkt.Data().Size() { 909 return sendTCPBatch(r, tf, pkt, gso, owner) 910 } 911 912 pkt.GSOOptions = gso 913 pkt.Hash = tf.txHash 914 pkt.Owner = owner 915 buildTCPHdr(r, tf, pkt, gso) 916 917 if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos, DF: tf.df}, pkt); err != nil { 918 r.Stats().TCP.SegmentSendErrors.Increment() 919 return err 920 } 921 r.Stats().TCP.SegmentsSent.Increment() 922 if (tf.flags & header.TCPFlagRst) != 0 { 923 r.Stats().TCP.ResetsSent.Increment() 924 } 925 return nil 926 } 927 928 // makeOptions makes an options slice. 929 func (e *Endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte { 930 options := getOptions() 931 offset := 0 932 933 // N.B. the ordering here matches the ordering used by Linux internally 934 // and described in the raw makeOptions function. We don't include 935 // unnecessary cases here (post connection.) 936 if e.SendTSOk { 937 // Embed the timestamp if timestamp has been enabled. 938 // 939 // We only use the lower 32 bits of the unix time in 940 // milliseconds. This is similar to what Linux does where it 941 // uses the lower 32 bits of the jiffies value in the tsVal 942 // field of the timestamp option. 943 // 944 // Further, RFC7323 section-5.4 recommends millisecond 945 // resolution as the lowest recommended resolution for the 946 // timestamp clock. 947 // 948 // Ref: https://tools.ietf.org/html/rfc7323#section-5.4. 949 offset += header.EncodeNOP(options[offset:]) 950 offset += header.EncodeNOP(options[offset:]) 951 offset += header.EncodeTSOption(e.tsValNow(), e.recentTimestamp(), options[offset:]) 952 } 953 if e.SACKPermitted && len(sackBlocks) > 0 { 954 offset += header.EncodeNOP(options[offset:]) 955 offset += header.EncodeNOP(options[offset:]) 956 offset += header.EncodeSACKBlocks(sackBlocks, options[offset:]) 957 } 958 959 // We expect the above to produce an aligned offset. 960 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 961 panic("unexpected option encoding") 962 } 963 964 return options[:offset] 965 } 966 967 // sendEmptyRaw sends a TCP segment with no payload to the endpoint's peer. 968 // 969 // +checklocks:e.mu 970 // +checklocksalias:e.snd.ep.mu=e.mu 971 func (e *Endpoint) sendEmptyRaw(flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error { 972 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{}) 973 defer pkt.DecRef() 974 return e.sendRaw(pkt, flags, seq, ack, rcvWnd) 975 } 976 977 // sendRaw sends a TCP segment to the endpoint's peer. This method takes 978 // ownership of pkt. pkt must not have any headers set. 979 // 980 // +checklocks:e.mu 981 // +checklocksalias:e.snd.ep.mu=e.mu 982 func (e *Endpoint) sendRaw(pkt *stack.PacketBuffer, flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error { 983 var sackBlocks []header.SACKBlock 984 if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) { 985 sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] 986 } 987 options := e.makeOptions(sackBlocks) 988 defer putOptions(options) 989 pkt.ReserveHeaderBytes(header.TCPMinimumSize + int(e.route.MaxHeaderLength()) + len(options)) 990 return e.sendTCP(e.route, tcpFields{ 991 id: e.TransportEndpointInfo.ID, 992 ttl: calculateTTL(e.route, e.ipv4TTL, e.ipv6HopLimit), 993 tos: e.sendTOS, 994 flags: flags, 995 seq: seq, 996 ack: ack, 997 rcvWnd: rcvWnd, 998 opts: options, 999 df: e.pmtud == tcpip.PMTUDiscoveryWant || e.pmtud == tcpip.PMTUDiscoveryDo, 1000 }, pkt, e.gso) 1001 } 1002 1003 // +checklocks:e.mu 1004 // +checklocksalias:e.snd.ep.mu=e.mu 1005 func (e *Endpoint) sendData(next *segment) { 1006 // Initialize the next segment to write if it's currently nil. 1007 if e.snd.writeNext == nil { 1008 if next == nil { 1009 return 1010 } 1011 e.snd.updateWriteNext(next) 1012 } 1013 1014 // Push out any new packets. 1015 e.snd.sendData() 1016 } 1017 1018 // resetConnectionLocked puts the endpoint in an error state with the given 1019 // error code and sends a RST if and only if the error is not ErrConnectionReset 1020 // indicating that the connection is being reset due to receiving a RST. This 1021 // method must only be called from the protocol goroutine. 1022 // +checklocks:e.mu 1023 func (e *Endpoint) resetConnectionLocked(err tcpip.Error) { 1024 // Only send a reset if the connection is being aborted for a reason 1025 // other than receiving a reset. 1026 e.hardError = err 1027 switch err.(type) { 1028 case *tcpip.ErrConnectionReset, *tcpip.ErrTimeout: 1029 default: 1030 // The exact sequence number to be used for the RST is the same as the 1031 // one used by Linux. We need to handle the case of window being shrunk 1032 // which can cause sndNxt to be outside the acceptable window on the 1033 // receiver. 1034 // 1035 // See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more 1036 // information. 1037 sndWndEnd := e.snd.SndUna.Add(e.snd.SndWnd) 1038 resetSeqNum := sndWndEnd 1039 if !sndWndEnd.LessThan(e.snd.SndNxt) || e.snd.SndNxt.Size(sndWndEnd) < (1<<e.snd.SndWndScale) { 1040 resetSeqNum = e.snd.SndNxt 1041 } 1042 e.sendEmptyRaw(header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.RcvNxt, 0) 1043 } 1044 // Don't purge read queues here. If there's buffered data, it's still allowed 1045 // to be read. 1046 e.purgeWriteQueue() 1047 e.purgePendingRcvQueue() 1048 e.cleanupLocked() 1049 e.setEndpointState(StateError) 1050 } 1051 1052 // transitionToStateCloseLocked ensures that the endpoint is 1053 // cleaned up from the transport demuxer, "before" moving to 1054 // StateClose. This will ensure that no packet will be 1055 // delivered to this endpoint from the demuxer when the endpoint 1056 // is transitioned to StateClose. 1057 // +checklocks:e.mu 1058 func (e *Endpoint) transitionToStateCloseLocked() { 1059 s := e.EndpointState() 1060 if s == StateClose { 1061 return 1062 } 1063 1064 if s.connected() { 1065 e.stack.Stats().TCP.EstablishedClosed.Increment() 1066 } 1067 1068 e.cleanupLocked() 1069 // Mark the endpoint as fully closed for reads/writes. 1070 e.setEndpointState(StateClose) 1071 } 1072 1073 // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed 1074 // segment to any other endpoint other than the current one. This is called 1075 // only when the endpoint is in StateClose and we want to deliver the segment 1076 // to any other listening endpoint. We reply with RST if we cannot find one. 1077 func (e *Endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) { 1078 ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.TransportEndpointInfo.ID, s.pkt.NICID) 1079 if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != (tcpip.Address{}) { 1080 // Dual-stack socket, try IPv4. 1081 ep = e.stack.FindTransportEndpoint( 1082 header.IPv4ProtocolNumber, 1083 e.TransProto, 1084 e.TransportEndpointInfo.ID, 1085 s.pkt.NICID, 1086 ) 1087 } 1088 if ep == nil { 1089 if !s.flags.Contains(header.TCPFlagRst) { 1090 replyWithReset(e.stack, s, stack.DefaultTOS, tcpip.UseDefaultIPv4TTL, tcpip.UseDefaultIPv6HopLimit) 1091 } 1092 return 1093 } 1094 1095 if e == ep { 1096 panic(fmt.Sprintf("current endpoint not removed from demuxer, enqueuing segments to itself, endpoint in state %v", e.EndpointState())) 1097 } 1098 1099 if ep := ep.(*Endpoint); ep.enqueueSegment(s) { 1100 ep.notifyProcessor() 1101 } 1102 } 1103 1104 // Drain segment queue from the endpoint and try to re-match the segment to a 1105 // different endpoint. This is used when the current endpoint is transitioned to 1106 // StateClose and has been unregistered from the transport demuxer. 1107 func (e *Endpoint) drainClosingSegmentQueue() { 1108 for { 1109 s := e.segmentQueue.dequeue() 1110 if s == nil { 1111 break 1112 } 1113 1114 e.tryDeliverSegmentFromClosedEndpoint(s) 1115 s.DecRef() 1116 } 1117 } 1118 1119 // +checklocks:e.mu 1120 func (e *Endpoint) handleReset(s *segment) (ok bool, err tcpip.Error) { 1121 if e.rcv.acceptable(s.sequenceNumber, 0) { 1122 // RFC 793, page 37 states that "in all states 1123 // except SYN-SENT, all reset (RST) segments are 1124 // validated by checking their SEQ-fields." So 1125 // we only process it if it's acceptable. 1126 switch e.EndpointState() { 1127 // In case of a RST in CLOSE-WAIT linux moves 1128 // the socket to closed state with an error set 1129 // to indicate EPIPE. 1130 // 1131 // Technically this seems to be at odds w/ RFC. 1132 // As per https://tools.ietf.org/html/rfc793#section-2.7 1133 // page 69 the behavior for a segment arriving 1134 // w/ RST bit set in CLOSE-WAIT is inlined below. 1135 // 1136 // ESTABLISHED 1137 // FIN-WAIT-1 1138 // FIN-WAIT-2 1139 // CLOSE-WAIT 1140 1141 // If the RST bit is set then, any outstanding RECEIVEs and 1142 // SEND should receive "reset" responses. All segment queues 1143 // should be flushed. Users should also receive an unsolicited 1144 // general "connection reset" signal. Enter the CLOSED state, 1145 // delete the TCB, and return. 1146 case StateCloseWait: 1147 e.transitionToStateCloseLocked() 1148 e.hardError = &tcpip.ErrAborted{} 1149 return false, nil 1150 default: 1151 // RFC 793, page 37 states that "in all states 1152 // except SYN-SENT, all reset (RST) segments are 1153 // validated by checking their SEQ-fields." So 1154 // we only process it if it's acceptable. 1155 1156 // Notify protocol goroutine. This is required when 1157 // handleSegment is invoked from the processor goroutine 1158 // rather than the worker goroutine. 1159 return false, &tcpip.ErrConnectionReset{} 1160 } 1161 } 1162 return true, nil 1163 } 1164 1165 // handleSegments processes all inbound segments. 1166 // 1167 // +checklocks:e.mu 1168 // +checklocksalias:e.snd.ep.mu=e.mu 1169 func (e *Endpoint) handleSegmentsLocked() tcpip.Error { 1170 sndUna := e.snd.SndUna 1171 for i := 0; i < maxSegmentsPerWake; i++ { 1172 if state := e.EndpointState(); state.closed() || state == StateTimeWait || state == StateError { 1173 return nil 1174 } 1175 s := e.segmentQueue.dequeue() 1176 if s == nil { 1177 break 1178 } 1179 cont, err := e.handleSegmentLocked(s) 1180 s.DecRef() 1181 if err != nil { 1182 return err 1183 } 1184 if !cont { 1185 return nil 1186 } 1187 } 1188 1189 // The remote ACK-ing at least 1 byte is an indication that we have a 1190 // full-duplex connection to the remote as the only way we will receive an 1191 // ACK is if the remote received data that we previously sent. 1192 // 1193 // As of writing, Linux seems to only confirm a route as reachable when 1194 // forward progress is made which is indicated by an ACK that removes data 1195 // from the retransmit queue, i.e. sender makes forward progress. 1196 if sndUna.LessThan(e.snd.SndUna) { 1197 e.route.ConfirmReachable() 1198 } 1199 1200 // Send an ACK for all processed packets if needed. 1201 if e.rcv.RcvNxt != e.snd.MaxSentAck { 1202 e.snd.sendAck() 1203 } 1204 1205 e.resetKeepaliveTimer(true /* receivedData */) 1206 1207 return nil 1208 } 1209 1210 // +checklocks:e.mu 1211 func (e *Endpoint) probeSegmentLocked() { 1212 if fn := e.probe; fn != nil { 1213 var state stack.TCPEndpointState 1214 e.completeStateLocked(&state) 1215 fn(&state) 1216 } 1217 } 1218 1219 // handleSegment handles a given segment and notifies the worker goroutine if 1220 // if the connection should be terminated. 1221 // 1222 // +checklocks:e.mu 1223 // +checklocksalias:e.rcv.ep.mu=e.mu 1224 // +checklocksalias:e.snd.ep.mu=e.mu 1225 func (e *Endpoint) handleSegmentLocked(s *segment) (cont bool, err tcpip.Error) { 1226 // Invoke the tcp probe if installed. The tcp probe function will update 1227 // the TCPEndpointState after the segment is processed. 1228 defer e.probeSegmentLocked() 1229 1230 if s.flags.Contains(header.TCPFlagRst) { 1231 if ok, err := e.handleReset(s); !ok { 1232 return false, err 1233 } 1234 } else if s.flags.Contains(header.TCPFlagSyn) { 1235 // See: https://tools.ietf.org/html/rfc5961#section-4.1 1236 // 1) If the SYN bit is set, irrespective of the sequence number, TCP 1237 // MUST send an ACK (also referred to as challenge ACK) to the remote 1238 // peer: 1239 // 1240 // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> 1241 // 1242 // After sending the acknowledgment, TCP MUST drop the unacceptable 1243 // segment and stop processing further. 1244 // 1245 // By sending an ACK, the remote peer is challenged to confirm the loss 1246 // of the previous connection and the request to start a new connection. 1247 // A legitimate peer, after restart, would not have a TCB in the 1248 // synchronized state. Thus, when the ACK arrives, the peer should send 1249 // a RST segment back with the sequence number derived from the ACK 1250 // field that caused the RST. 1251 1252 // This RST will confirm that the remote peer has indeed closed the 1253 // previous connection. Upon receipt of a valid RST, the local TCP 1254 // endpoint MUST terminate its connection. The local TCP endpoint 1255 // should then rely on SYN retransmission from the remote end to 1256 // re-establish the connection. 1257 e.snd.maybeSendOutOfWindowAck(s) 1258 } else if s.flags.Contains(header.TCPFlagAck) { 1259 // Patch the window size in the segment according to the 1260 // send window scale. 1261 s.window <<= e.snd.SndWndScale 1262 1263 // RFC 793, page 41 states that "once in the ESTABLISHED 1264 // state all segments must carry current acknowledgment 1265 // information." 1266 drop, err := e.rcv.handleRcvdSegment(s) 1267 if err != nil { 1268 return false, err 1269 } 1270 if drop { 1271 return true, nil 1272 } 1273 1274 // Now check if the received segment has caused us to transition 1275 // to a CLOSED state, if yes then terminate processing and do 1276 // not invoke the sender. 1277 state := e.EndpointState() 1278 if state == StateClose { 1279 // When we get into StateClose while processing from the queue, 1280 // return immediately and let the protocolMainloop handle it. 1281 // 1282 // We can reach StateClose only while processing a previous segment 1283 // or a notification from the protocolMainLoop (caller goroutine). 1284 // This means that with this return, the segment dequeue below can 1285 // never occur on a closed endpoint. 1286 return false, nil 1287 } 1288 1289 e.snd.handleRcvdSegment(s) 1290 } 1291 1292 return true, nil 1293 } 1294 1295 // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP 1296 // keepalive packets periodically when the connection is idle. If we don't hear 1297 // from the other side after a number of tries, we terminate the connection. 1298 // +checklocks:e.mu 1299 // +checklocksalias:e.snd.ep.mu=e.mu 1300 func (e *Endpoint) keepaliveTimerExpired() tcpip.Error { 1301 userTimeout := e.userTimeout 1302 1303 // If the route is not ready or already cleaned up, then we don't need to 1304 // send keepalives. 1305 if e.route == nil { 1306 return nil 1307 } 1308 e.keepalive.Lock() 1309 if !e.SocketOptions().GetKeepAlive() || e.keepalive.timer.isUninitialized() || !e.keepalive.timer.checkExpiration() { 1310 e.keepalive.Unlock() 1311 return nil 1312 } 1313 1314 // If a userTimeout is set then abort the connection if it is 1315 // exceeded. 1316 if userTimeout != 0 && e.stack.Clock().NowMonotonic().Sub(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 { 1317 e.keepalive.Unlock() 1318 e.stack.Stats().TCP.EstablishedTimedout.Increment() 1319 return &tcpip.ErrTimeout{} 1320 } 1321 1322 if e.keepalive.unacked >= e.keepalive.count { 1323 e.keepalive.Unlock() 1324 e.stack.Stats().TCP.EstablishedTimedout.Increment() 1325 return &tcpip.ErrTimeout{} 1326 } 1327 1328 // RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with 1329 // seg.seq = snd.nxt-1. 1330 e.keepalive.unacked++ 1331 e.keepalive.Unlock() 1332 e.snd.sendEmptySegment(header.TCPFlagAck, e.snd.SndNxt-1) 1333 e.resetKeepaliveTimer(false) 1334 return nil 1335 } 1336 1337 // resetKeepaliveTimer restarts or stops the keepalive timer, depending on 1338 // whether it is enabled for this endpoint. 1339 func (e *Endpoint) resetKeepaliveTimer(receivedData bool) { 1340 e.keepalive.Lock() 1341 defer e.keepalive.Unlock() 1342 if e.keepalive.timer.isUninitialized() { 1343 if state := e.EndpointState(); !state.closed() { 1344 panic(fmt.Sprintf("Unexpected state when the keepalive time is cleaned up, got %s, want %s or %s", state, StateClose, StateError)) 1345 } 1346 return 1347 } 1348 if receivedData { 1349 e.keepalive.unacked = 0 1350 } 1351 // Start the keepalive timer IFF it's enabled and there is no pending 1352 // data to send. 1353 if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.SndUna != e.snd.SndNxt { 1354 e.keepalive.timer.disable() 1355 return 1356 } 1357 if e.keepalive.unacked > 0 { 1358 e.keepalive.timer.enable(e.keepalive.interval) 1359 } else { 1360 e.keepalive.timer.enable(e.keepalive.idle) 1361 } 1362 } 1363 1364 // disableKeepaliveTimer stops the keepalive timer. 1365 func (e *Endpoint) disableKeepaliveTimer() { 1366 e.keepalive.Lock() 1367 e.keepalive.timer.disable() 1368 e.keepalive.Unlock() 1369 } 1370 1371 // finWait2TimerExpired is called when the FIN-WAIT-2 timeout is hit 1372 // and the peer hasn't sent us a FIN. 1373 func (e *Endpoint) finWait2TimerExpired() { 1374 e.mu.Lock() 1375 e.transitionToStateCloseLocked() 1376 e.mu.Unlock() 1377 e.drainClosingSegmentQueue() 1378 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1379 } 1380 1381 // +checklocks:e.mu 1382 func (e *Endpoint) handshakeFailed(err tcpip.Error) { 1383 e.lastErrorMu.Lock() 1384 e.lastError = err 1385 e.lastErrorMu.Unlock() 1386 // handshakeFailed is also called from startHandshake when a listener 1387 // transitions out of Listen state by the time the SYN is processed. In 1388 // such cases the handshake is never initialized and the newly created 1389 // endpoint is closed right away. 1390 if e.h != nil && e.h.retransmitTimer != nil { 1391 e.h.retransmitTimer.stop() 1392 } 1393 e.hardError = err 1394 e.cleanupLocked() 1395 e.setEndpointState(StateError) 1396 } 1397 1398 // handleTimeWaitSegments processes segments received during TIME_WAIT 1399 // state. 1400 // +checklocks:e.mu 1401 // +checklocksalias:e.rcv.ep.mu=e.mu 1402 func (e *Endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) { 1403 for i := 0; i < maxSegmentsPerWake; i++ { 1404 s := e.segmentQueue.dequeue() 1405 if s == nil { 1406 break 1407 } 1408 extTW, newSyn := e.rcv.handleTimeWaitSegment(s) 1409 if newSyn { 1410 info := e.TransportEndpointInfo 1411 newID := info.ID 1412 newID.RemoteAddress = tcpip.Address{} 1413 newID.RemotePort = 0 1414 netProtos := []tcpip.NetworkProtocolNumber{info.NetProto} 1415 // If the local address is an IPv4 address then also 1416 // look for IPv6 dual stack endpoints that might be 1417 // listening on the local address. 1418 if newID.LocalAddress.To4() != (tcpip.Address{}) { 1419 netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber} 1420 } 1421 for _, netProto := range netProtos { 1422 if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, s.pkt.NICID); listenEP != nil { 1423 tcpEP := listenEP.(*Endpoint) 1424 if EndpointState(tcpEP.State()) == StateListen { 1425 reuseTW = func() { 1426 if !tcpEP.enqueueSegment(s) { 1427 return 1428 } 1429 tcpEP.notifyProcessor() 1430 s.DecRef() 1431 } 1432 // We explicitly do not DecRef the segment as it's still valid and 1433 // being reflected to a listening endpoint. 1434 return false, reuseTW 1435 } 1436 } 1437 } 1438 } 1439 if extTW { 1440 extendTimeWait = true 1441 } 1442 s.DecRef() 1443 } 1444 return extendTimeWait, nil 1445 } 1446 1447 // +checklocks:e.mu 1448 func (e *Endpoint) getTimeWaitDuration() time.Duration { 1449 timeWaitDuration := DefaultTCPTimeWaitTimeout 1450 1451 // Get the stack wide configuration. 1452 var tcpTW tcpip.TCPTimeWaitTimeoutOption 1453 if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil { 1454 timeWaitDuration = time.Duration(tcpTW) 1455 } 1456 return timeWaitDuration 1457 } 1458 1459 // timeWaitTimerExpired is called when an endpoint completes the required time 1460 // (typically 2 * MSL unless configured to something else at a stack level) in 1461 // TIME-WAIT state. 1462 func (e *Endpoint) timeWaitTimerExpired() { 1463 e.mu.Lock() 1464 if e.EndpointState() != StateTimeWait { 1465 e.mu.Unlock() 1466 return 1467 } 1468 e.transitionToStateCloseLocked() 1469 e.mu.Unlock() 1470 e.drainClosingSegmentQueue() 1471 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1472 } 1473 1474 // notifyProcessor queues this endpoint for processing to its TCP processor. 1475 func (e *Endpoint) notifyProcessor() { 1476 // We use TryLock here to avoid deadlocks in cases where a listening endpoint that is being 1477 // closed tries to abort half completed connections which in turn try to queue any segments 1478 // queued to that endpoint back to the same listening endpoint (because it may have got 1479 // segments that matched its id but were either a RST or a new SYN which must be handled 1480 // by a listening endpoint). In such cases the Close() on the listening endpoint will handle 1481 // any queued segments after it releases the lock. 1482 if !e.mu.TryLock() { 1483 return 1484 } 1485 processor := e.protocol.dispatcher.selectProcessor(e.ID) 1486 e.mu.Unlock() 1487 processor.queueEndpoint(e) 1488 }