github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/tcpip/transport/tcp/connect.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "encoding/binary" 19 "fmt" 20 "math" 21 "time" 22 23 "github.com/nicocha30/gvisor-ligolo/pkg/sync" 24 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip" 25 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/checksum" 26 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/hash/jenkins" 27 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/header" 28 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/seqnum" 29 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack" 30 "github.com/nicocha30/gvisor-ligolo/pkg/waiter" 31 ) 32 33 // InitialRTO is the initial retransmission timeout. 34 // https://github.com/torvalds/linux/blob/7c636d4d20f/include/net/tcp.h#L142 35 const InitialRTO = time.Second 36 37 // maxSegmentsPerWake is the maximum number of segments to process in the main 38 // protocol goroutine per wake-up. Yielding [after this number of segments are 39 // processed] allows other events to be processed as well (e.g., timeouts, 40 // resets, etc.). 41 const maxSegmentsPerWake = 100 42 43 type handshakeState int 44 45 // The following are the possible states of the TCP connection during a 3-way 46 // handshake. A depiction of the states and transitions can be found in RFC 793, 47 // page 23. 
const (
	// handshakeSynSent is the state selected by resetState: our SYN is the
	// only thing that has been exchanged so far.
	handshakeSynSent handshakeState = iota
	// handshakeSynRcvd is the state entered once the peer's SYN has been
	// seen and we are waiting for our own SYN to be acknowledged.
	handshakeSynRcvd
	// handshakeCompleted is the state entered right before the endpoint is
	// transitioned to the established state.
	handshakeCompleted
)

const (
	// Maximum space available for options.
	maxOptionSize = 40
)

// handshake holds the state used during a TCP 3-way handshake.
//
// NOTE: handshake.ep.mu is held during handshake processing. It is released if
// we are going to block and reacquired when we start processing an event.
//
// +stateify savable
type handshake struct {
	// ep is the endpoint performing the handshake. listenEP, when non-nil,
	// is the endpoint whose accept queue is checked before a handshake in
	// the SYN-RCVD state is completed. state is the current handshake
	// state, active is true for an active (as opposed to passive)
	// handshake, flags are the TCP flags sent on the SYN/SYN-ACK and ackNum
	// is the acknowledgment number sent to the peer.
	ep       *endpoint
	listenEP *endpoint
	state    handshakeState
	active   bool
	flags    header.TCPFlags
	ackNum   seqnum.Value

	// iss is the initial send sequence number, as defined in RFC 793.
	iss seqnum.Value

	// rcvWnd is the receive window, as defined in RFC 793.
	rcvWnd seqnum.Size

	// sndWnd is the send window, as defined in RFC 793.
	sndWnd seqnum.Size

	// mss is the maximum segment size received from the peer.
	mss uint16

	// sndWndScale is the send window scale, as defined in RFC 1323. A
	// negative value means no scaling is supported by the peer.
	sndWndScale int

	// rcvWndScale is the receive window scale, as defined in RFC 1323.
	rcvWndScale int

	// startTime is the time at which the first SYN/SYN-ACK was sent.
	startTime tcpip.MonotonicTime

	// deferAccept if non-zero will drop the final ACK for a passive
	// handshake till an ACK segment with data is received or the timeout is
	// hit.
	deferAccept time.Duration

	// acked is true if the final ACK for a 3-way handshake has
	// been received. This is required to stop retransmitting the
	// original SYN-ACK when deferAccept is enabled.
	acked bool

	// sendSYNOpts is the cached values for the SYN options to be sent.
	sendSYNOpts header.TCPSynOptions

	// sampleRTTWithTSOnly is true when the segment was retransmitted or we can't
	// tell; then RTT can only be sampled when the incoming segment has timestamp
	// options enabled.
	sampleRTTWithTSOnly bool

	// retransmitTimer is used to retransmit SYN/SYN-ACK with exponential backoff
	// till handshake is either completed or times out.
	retransmitTimer *backoffTimer `state:"nosave"`
}

// maybeFailTimerHandler takes a handler function for a timer that may fail and
// returns a function that will invoke the provided handler with the endpoint
// mutex held. In addition the returned function will perform any cleanup that
// may be required if the timer handler returns an error and in case of no
// errors will notify the processor if there are pending segments that need to
// be processed.
//
// On error the endpoint is moved to StateError and waiters are notified with
// hang-up, error, readable and writable events.
//
// NOTE: e.mu is held for the duration of the call to f().
func maybeFailTimerHandler(e *endpoint, f func() tcpip.Error) func() {
	return func() {
		e.mu.Lock()
		if err := f(); err != nil {
			e.lastErrorMu.Lock()
			// If the handler timed out and we have a lastError recorded (maybe due
			// to an ICMP message received), promote it to be the hard error.
			if _, isTimeout := err.(*tcpip.ErrTimeout); e.lastError != nil && isTimeout {
				e.hardError = e.lastError
			} else {
				e.hardError = err
			}
			e.lastError = err
			e.lastErrorMu.Unlock()
			e.cleanupLocked()
			e.setEndpointState(StateError)
			// Waiters are notified only after e.mu has been released.
			e.mu.Unlock()
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
			return
		}
		// The processor is selected while e.mu is still held; it is only
		// used after the lock has been released.
		processor := e.protocol.dispatcher.selectProcessor(e.ID)
		e.mu.Unlock()

		// notify processor if there are pending segments to be
		// processed.
		if !e.segmentQueue.empty() {
			processor.queueEndpoint(e)
		}
	}
}

// timerHandler takes a handler function for a timer that never results in a
// connection being aborted and returns a function that will invoke the provided
// handler with the endpoint mutex held. In addition the returned function will
// notify the processor if there are pending segments that need to be processed
// once the handler function completes.
//
// NOTE: e.mu is held for the duration of the call to f()
func timerHandler(e *endpoint, f func()) func() {
	return func() {
		e.mu.Lock()
		f()
		processor := e.protocol.dispatcher.selectProcessor(e.ID)
		e.mu.Unlock()
		// notify processor if there are pending segments to be
		// processed.
		if !e.segmentQueue.empty() {
			processor.queueEndpoint(e)
		}
	}
}

// newHandshake creates the handshake state for e in the SYN-SENT state, stores
// it in e.h and creates the SYN retransmission timer, which fires first after
// InitialRTO.
//
// It panics if the retransmission timer cannot be created, which
// newBackoffTimer only reports when InitialRTO is greater than MaxRTO.
//
// +checklocks:e.mu
// +checklocksacquire:h.ep.mu
func (e *endpoint) newHandshake() (h *handshake) {
	h = &handshake{
		ep:          e,
		active:      true,
		rcvWnd:      seqnum.Size(e.initialReceiveWindow()),
		rcvWndScale: e.rcvWndScaleForHandshake(),
	}
	h.ep.AssertLockHeld(e)
	h.resetState()
	// Store reference to handshake state in endpoint.
	e.h = h
	// By the time handshake is created, e.ID is already initialized.
	e.TSOffset = e.protocol.tsOffset(e.ID.LocalAddress, e.ID.RemoteAddress)
	// A failing retransmit handler tears the endpoint down through
	// maybeFailTimerHandler.
	timer, err := newBackoffTimer(h.ep.stack.Clock(), InitialRTO, MaxRTO, maybeFailTimerHandler(e, h.retransmitHandlerLocked))
	if err != nil {
		panic(fmt.Sprintf("newBackOffTimer(_, %s, %s, _) failed: %s", InitialRTO, MaxRTO, err))
	}
	h.retransmitTimer = timer
	return h
}

// newPassiveHandshake creates a handshake for e via newHandshake and then
// moves it to the SYN-RCVD state using the given initial send sequence number
// (isn), the peer's initial sequence number (irs), the peer's SYN options and
// the deferAccept duration.
//
// +checklocks:e.mu
// +checklocksacquire:h.ep.mu
func (e *endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) (h *handshake) {
	h = e.newHandshake()
	h.resetToSynRcvd(isn, irs, opts, deferAccept)
	return h
}

// FindWndScale determines the window scale to use for the given maximum window
// size.
211 func FindWndScale(wnd seqnum.Size) int { 212 if wnd < 0x10000 { 213 return 0 214 } 215 216 max := seqnum.Size(math.MaxUint16) 217 s := 0 218 for wnd > max && s < header.MaxWndScale { 219 s++ 220 max <<= 1 221 } 222 223 return s 224 } 225 226 // resetState resets the state of the handshake object such that it becomes 227 // ready for a new 3-way handshake. 228 func (h *handshake) resetState() { 229 h.state = handshakeSynSent 230 h.flags = header.TCPFlagSyn 231 h.ackNum = 0 232 h.mss = 0 233 h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.protocol.seqnumSecret) 234 } 235 236 // generateSecureISN generates a secure Initial Sequence number based on the 237 // recommendation here https://tools.ietf.org/html/rfc6528#page-3. 238 func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed uint32) seqnum.Value { 239 isnHasher := jenkins.Sum32(seed) 240 // Per hash.Hash.Writer: 241 // 242 // It never returns an error. 243 _, _ = isnHasher.Write(id.LocalAddress.AsSlice()) 244 _, _ = isnHasher.Write(id.RemoteAddress.AsSlice()) 245 portBuf := make([]byte, 2) 246 binary.LittleEndian.PutUint16(portBuf, id.LocalPort) 247 _, _ = isnHasher.Write(portBuf) 248 binary.LittleEndian.PutUint16(portBuf, id.RemotePort) 249 _, _ = isnHasher.Write(portBuf) 250 // The time period here is 64ns. This is similar to what linux uses 251 // generate a sequence number that overlaps less than one 252 // time per MSL (2 minutes). 253 // 254 // A 64ns clock ticks 10^9/64 = 15625000) times in a second. 255 // To wrap the whole 32 bit space would require 256 // 2^32/1562500 ~ 274 seconds. 257 // 258 // Which sort of guarantees that we won't reuse the ISN for a new 259 // connection for the same tuple for at least 274s. 260 isn := isnHasher.Sum32() + uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Nanoseconds()>>6) 261 return seqnum.Value(isn) 262 } 263 264 // effectiveRcvWndScale returns the effective receive window scale to be used. 
// If the peer doesn't support window scaling, the effective rcv wnd scale is
// zero; otherwise it's the value calculated based on the initial rcv wnd.
func (h *handshake) effectiveRcvWndScale() uint8 {
	// A negative send window scale records that the peer did not offer the
	// window scale option.
	if h.sndWndScale < 0 {
		return 0
	}
	return uint8(h.rcvWndScale)
}

// resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
// state.
//
// iss is our initial send sequence number, irs is the peer's initial sequence
// number (we acknowledge irs+1), opts are the options parsed from the peer's
// SYN and deferAccept is the duration for which a bare final ACK is dropped.
// The handshake is marked as passive and the endpoint is moved to
// StateSynRecv.
// +checklocks:h.ep.mu
func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) {
	h.active = false
	h.state = handshakeSynRcvd
	h.flags = header.TCPFlagSyn | header.TCPFlagAck
	h.iss = iss
	h.ackNum = irs + 1
	h.mss = opts.MSS
	h.sndWndScale = opts.WS
	h.deferAccept = deferAccept
	h.ep.setEndpointState(StateSynRecv)
}

// checkAck checks if the ACK number, if present, of a segment received during
// a TCP 3-way handshake is valid. If it's not, a RST segment is sent back in
// response.
//
// It returns true when the segment carries no ACK or when the ACK number
// acknowledges exactly our SYN (iss+1), and false otherwise.
func (h *handshake) checkAck(s *segment) bool {
	if s.flags.Contains(header.TCPFlagAck) && s.ackNumber != h.iss+1 {
		// RFC 793, page 72 (https://datatracker.ietf.org/doc/html/rfc793#page-72):
		//   If the segment acknowledgment is not acceptable, form a reset segment,
		//        <SEQ=SEG.ACK><CTL=RST>
		//   and send it.
		h.ep.sendEmptyRaw(header.TCPFlagRst, s.ackNumber, 0, 0)
		return false
	}

	return true
}

// synSentState handles a segment received when the TCP 3-way handshake is in
// the SYN-SENT state.
//
// A SYN-ACK completes the handshake and is acknowledged. A bare SYN
// (simultaneous open) moves the handshake to the SYN-RCVD state and triggers a
// SYN-ACK. An acceptable RST yields ErrConnectionRefused. Every other segment
// is ignored and nil is returned.
// +checklocks:h.ep.mu
func (h *handshake) synSentState(s *segment) tcpip.Error {
	// RFC 793, page 37, states that in the SYN-SENT state, a reset is
	// acceptable if the ack field acknowledges the SYN.
	if s.flags.Contains(header.TCPFlagRst) {
		if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
			// was acceptable then signal the user "error: connection reset", drop
			// the segment, enter CLOSED state, delete TCB, and return."
			// Although the RFC above calls out ECONNRESET, Linux actually returns
			// ECONNREFUSED here so we do as well.
			return &tcpip.ErrConnectionRefused{}
		}
		return nil
	}

	if !h.checkAck(s) {
		return nil
	}

	// We are in the SYN-SENT state. We only care about segments that have
	// the SYN flag.
	if !s.flags.Contains(header.TCPFlagSyn) {
		return nil
	}

	// Parse the SYN options.
	rcvSynOpts := parseSynSegmentOptions(s)

	// Remember if the Timestamp option was negotiated.
	h.ep.maybeEnableTimestamp(rcvSynOpts)

	// Remember if the SACKPermitted option was negotiated.
	h.ep.maybeEnableSACKPermitted(rcvSynOpts)

	// Remember the sequence we'll ack from now on.
	h.ackNum = s.sequenceNumber + 1
	h.flags |= header.TCPFlagAck
	h.mss = rcvSynOpts.MSS
	h.sndWndScale = rcvSynOpts.WS

	// If this is a SYN ACK response, we only need to acknowledge the SYN
	// and the handshake is completed.
	if s.flags.Contains(header.TCPFlagAck) {
		h.state = handshakeCompleted
		h.transitionToStateEstablishedLocked(s)

		// The handshake is complete, so the advertised window in this ACK
		// is shifted by the effective receive window scale.
		h.ep.sendEmptyRaw(header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
		return nil
	}

	// A SYN segment was received, but no ACK in it. We acknowledge the SYN
	// but resend our own SYN and wait for it to be acknowledged in the
	// SYN-RCVD state.
	h.state = handshakeSynRcvd
	ttl := calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit)
	amss := h.ep.amss
	h.ep.setEndpointState(StateSynRecv)
	synOpts := header.TCPSynOptions{
		WS:    int(h.effectiveRcvWndScale()),
		TS:    rcvSynOpts.TS,
		TSVal: h.ep.tsValNow(),
		TSEcr: h.ep.recentTimestamp(),

		// We only send SACKPermitted if the other side indicated it
		// permits SACK. This is not explicitly defined in the RFC but
		// this is the behaviour implemented by Linux.
		SACKPermitted: rcvSynOpts.SACKPermitted,
		MSS:           amss,
	}
	// Fall back to the route's default TTL when none was computed.
	if ttl == 0 {
		ttl = h.ep.route.DefaultTTL()
	}
	h.ep.sendSynTCP(h.ep.route, tcpFields{
		id:     h.ep.TransportEndpointInfo.ID,
		ttl:    ttl,
		tos:    h.ep.sendTOS,
		flags:  h.flags,
		seq:    h.iss,
		ack:    h.ackNum,
		rcvWnd: h.rcvWnd,
	}, synOpts)
	return nil
}

// synRcvdState handles a segment received when the TCP 3-way handshake is in
// the SYN-RCVD state.
//
// An in-window RST yields ErrConnectionRefused. A second SYN with a different
// sequence number is answered with a RST; a passive handshake then fails with
// ErrInvalidEndpointState while an active one restarts. An acceptable ACK
// completes the handshake unless it is dropped because of deferAccept, a
// missing timestamp option or a full accept queue.
// +checklocks:h.ep.mu
func (h *handshake) synRcvdState(s *segment) tcpip.Error {
	if s.flags.Contains(header.TCPFlagRst) {
		// RFC 793, page 37, states that in the SYN-RCVD state, a reset
		// is acceptable if the sequence number is in the window.
		if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
			return &tcpip.ErrConnectionRefused{}
		}
		return nil
	}

	if !h.checkAck(s) {
		return nil
	}

	// RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a
	// sequence number outside of the window causes an ACK with the proper seq
	// number and "After sending the acknowledgment, drop the unacceptable
	// segment and return."
	if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
		if h.ep.allowOutOfWindowAck() {
			h.ep.sendEmptyRaw(header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd)
		}
		return nil
	}

	if s.flags.Contains(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 {
		// We received two SYN segments with different sequence
		// numbers, so we reset this and restart the whole
		// process, except that we don't reset the timer.
		ack := s.sequenceNumber.Add(s.logicalLen())
		seq := seqnum.Value(0)
		if s.flags.Contains(header.TCPFlagAck) {
			seq = s.ackNumber
		}
		h.ep.sendEmptyRaw(header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0)

		// Only an active handshake is restarted; a passive one fails.
		if !h.active {
			return &tcpip.ErrInvalidEndpointState{}
		}

		h.resetState()
		synOpts := header.TCPSynOptions{
			WS:            h.rcvWndScale,
			TS:            h.ep.SendTSOk,
			TSVal:         h.ep.tsValNow(),
			TSEcr:         h.ep.recentTimestamp(),
			SACKPermitted: h.ep.SACKPermitted,
			MSS:           h.ep.amss,
		}
		h.ep.sendSynTCP(h.ep.route, tcpFields{
			id:     h.ep.TransportEndpointInfo.ID,
			ttl:    calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit),
			tos:    h.ep.sendTOS,
			flags:  h.flags,
			seq:    h.iss,
			ack:    h.ackNum,
			rcvWnd: h.rcvWnd,
		}, synOpts)
		return nil
	}

	// We have previously received (and acknowledged) the peer's SYN. If the
	// peer acknowledges our SYN, the handshake is completed.
	if s.flags.Contains(header.TCPFlagAck) {
		// If deferAccept is not zero and this is a bare ACK and the
		// timeout is not hit then drop the ACK.
		if h.deferAccept != 0 && s.payloadSize() == 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) < h.deferAccept {
			// Record the ACK so that the retransmit handler stops
			// resending the SYN-ACK until deferAccept expires.
			h.acked = true
			h.ep.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// If the timestamp option is negotiated and the segment does
		// not carry a timestamp option then the segment must be dropped
		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
		if h.ep.SendTSOk && !s.parsedOptions.TS {
			h.ep.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// Drop the ACK if the accept queue is full.
		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523
		// We could abort the connection as well with a tunable as in
		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788
		if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() {
			listenEP.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// Update timestamp if required. See RFC7323, section-4.3.
		if h.ep.SendTSOk && s.parsedOptions.TS {
			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
		}

		h.state = handshakeCompleted
		h.transitionToStateEstablishedLocked(s)

		// Requeue the segment if the ACK completing the handshake has more info
		// to be processed by the newly established endpoint.
		if (s.flags.Contains(header.TCPFlagFin) || s.payloadSize() > 0) && h.ep.enqueueSegment(s) {
			h.ep.protocol.dispatcher.selectProcessor(h.ep.ID).queueEndpoint(h.ep)
		}
		return nil
	}

	return nil
}

// handleSegment records the send window advertised by s and dispatches the
// segment to the handler for the current handshake state. Segments received
// in any other state are ignored.
// +checklocks:h.ep.mu
func (h *handshake) handleSegment(s *segment) tcpip.Error {
	h.sndWnd = s.window
	// The window carried by a SYN segment is taken as-is; only non-SYN
	// segments have the negotiated window scale applied (RFC 7323, section
	// 2.2, requires the window field of SYN segments to be unscaled).
	if !s.flags.Contains(header.TCPFlagSyn) && h.sndWndScale > 0 {
		h.sndWnd <<= uint8(h.sndWndScale)
	}

	switch h.state {
	case handshakeSynRcvd:
		return h.synRcvdState(s)
	case handshakeSynSent:
		return h.synSentState(s)
	}
	return nil
}

// processSegments goes through the segment queue and processes up to
// maxSegmentsPerWake (if they're available).
// +checklocks:h.ep.mu
func (h *handshake) processSegments() tcpip.Error {
	for i := 0; i < maxSegmentsPerWake; i++ {
		s := h.ep.segmentQueue.dequeue()
		if s == nil {
			return nil
		}

		err := h.handleSegment(s)
		// Drop the reference on the dequeued segment once it has been
		// handled.
		s.DecRef()
		if err != nil {
			return err
		}

		// We stop processing packets once the handshake is completed,
		// otherwise we may process packets meant to be processed by
		// the main protocol goroutine.
		if h.state == handshakeCompleted {
			break
		}
	}

	return nil
}

// start sends the first SYN/SYN-ACK. It does not block, even if link address
// resolution is required.
//
// It also records the handshake start time, computes the advertised MSS and
// caches the SYN options in h.sendSYNOpts, which is what the retransmit
// handler resends.
func (h *handshake) start() {
	h.startTime = h.ep.stack.Clock().NowMonotonic()
	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
	var sackEnabled tcpip.TCPSACKEnabled
	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
		// If stack returned an error when checking for SACKEnabled
		// status then just default to switching off SACK negotiation.
		sackEnabled = false
	}

	synOpts := header.TCPSynOptions{
		WS:            h.rcvWndScale,
		TS:            true,
		TSVal:         h.ep.tsValNow(),
		TSEcr:         h.ep.recentTimestamp(),
		SACKPermitted: bool(sackEnabled),
		MSS:           h.ep.amss,
	}

	// start() is also called in a listen context so we want to make sure we only
	// send the TS/SACK option when we received the TS/SACK in the initial SYN.
	if h.state == handshakeSynRcvd {
		synOpts.TS = h.ep.SendTSOk
		synOpts.SACKPermitted = h.ep.SACKPermitted && bool(sackEnabled)
		if h.sndWndScale < 0 {
			// Disable window scaling if the peer did not send us
			// the window scaling option.
			synOpts.WS = -1
		}
	}

	h.sendSYNOpts = synOpts
	h.ep.sendSynTCP(h.ep.route, tcpFields{
		id:     h.ep.TransportEndpointInfo.ID,
		ttl:    calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit),
		tos:    h.ep.sendTOS,
		flags:  h.flags,
		seq:    h.iss,
		ack:    h.ackNum,
		rcvWnd: h.rcvWnd,
	}, synOpts)
}

// retransmitHandlerLocked handles retransmissions of un-acked SYNs.
//
// It returns the error reported by the backoff timer once the retransmission
// timeout can no longer be doubled, and nil otherwise.
// +checklocks:h.ep.mu
func (h *handshake) retransmitHandlerLocked() tcpip.Error {
	e := h.ep
	// If the endpoint has already transitioned out of a connecting state,
	// e.g. because the peer sent a RST or an ICMP error was received, then
	// just return. Any required cleanup should have been done when the
	// RST/error was handled.
	if !e.EndpointState().connecting() {
		return nil
	}

	// Re-arm the timer with a doubled timeout before (possibly) resending.
	if err := h.retransmitTimer.reset(); err != nil {
		return err
	}

	// Resend the SYN/SYN-ACK only if the following conditions hold.
	//  - It's an active handshake (deferAccept does not apply)
	//  - It's a passive handshake and we have not yet got the final-ACK.
	//  - It's a passive handshake and we got an ACK but deferAccept is
	//    enabled and we are now past the deferAccept duration.
	// The last is required to provide a way for the peer to complete
	// the connection with another ACK or data (as ACKs are never
	// retransmitted on their own).
	if h.active || !h.acked || h.deferAccept != 0 && e.stack.Clock().NowMonotonic().Sub(h.startTime) > h.deferAccept {
		e.sendSynTCP(e.route, tcpFields{
			id:     e.TransportEndpointInfo.ID,
			ttl:    calculateTTL(e.route, e.ipv4TTL, e.ipv6HopLimit),
			tos:    e.sendTOS,
			flags:  h.flags,
			seq:    h.iss,
			ack:    h.ackNum,
			rcvWnd: h.rcvWnd,
		}, h.sendSYNOpts)
		// If we have ever retransmitted the SYN-ACK or
		// SYN segment, we should only measure RTT if
		// TS option is present.
		h.sampleRTTWithTSOnly = true
	}
	return nil
}

// transitionToStateEstablishedLocked transitions the endpoint of the handshake
// to an established state given the last segment received from peer. It also
// initializes sender/receiver.
// +checklocks:h.ep.mu
func (h *handshake) transitionToStateEstablishedLocked(s *segment) {
	// Stop the SYN retransmissions now that handshake is complete.
	if h.retransmitTimer != nil {
		h.retransmitTimer.stop()
	}

	// Transfer handshake state to TCP connection. We disable
	// receive window scaling if the peer doesn't support it
	// (indicated by a negative send window scale).
	h.ep.snd = newSender(h.ep, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)

	now := h.ep.stack.Clock().NowMonotonic()

	// Prefer an RTT sample derived from the echoed timestamp. Fall back to
	// the time elapsed since the handshake started only when the SYN was
	// never retransmitted.
	var rtt time.Duration
	if h.ep.SendTSOk && s.parsedOptions.TSEcr != 0 {
		rtt = h.ep.elapsed(now, s.parsedOptions.TSEcr)
	}
	if !h.sampleRTTWithTSOnly && rtt == 0 {
		rtt = now.Sub(h.startTime)
	}

	if rtt > 0 {
		h.ep.snd.updateRTO(rtt)
	}

	h.ep.rcvQueueMu.Lock()
	h.ep.rcv = newReceiver(h.ep, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale())
	// Bootstrap the auto tuning algorithm. Starting at zero will
	// result in a really large receive window after the first auto
	// tuning adjustment.
	h.ep.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd)
	h.ep.rcvQueueMu.Unlock()

	h.ep.setEndpointState(StateEstablished)

	// Completing the 3-way handshake is an indication that the route is valid
	// and the remote is reachable as the only way we can complete a handshake
	// is if our SYN reached the remote and their ACK reached us.
	h.ep.route.ConfirmReachable()

	// Tell waiters that the endpoint is connected and writable.
	h.ep.waiterQueue.Notify(waiter.WritableEvents)
}

// backoffTimer is a timer whose timeout is doubled on every reset until it
// would exceed maxTimeout.
type backoffTimer struct {
	timeout    time.Duration
	maxTimeout time.Duration
	t          tcpip.Timer
}

// newBackoffTimer returns a backoffTimer that calls f after timeout has
// elapsed on clock. It returns ErrTimeout, without scheduling anything, if
// timeout is greater than maxTimeout.
func newBackoffTimer(clock tcpip.Clock, timeout, maxTimeout time.Duration, f func()) (*backoffTimer, tcpip.Error) {
	if timeout > maxTimeout {
		return nil, &tcpip.ErrTimeout{}
	}
	bt := &backoffTimer{timeout: timeout, maxTimeout: maxTimeout}
	bt.t = clock.AfterFunc(timeout, f)
	return bt, nil
}

// reset doubles the timeout and re-arms the timer with it. If the doubled
// timeout exceeds maxTimeout the timer is left untouched and ErrTimeout is
// returned.
func (bt *backoffTimer) reset() tcpip.Error {
	bt.timeout *= 2
	if bt.timeout > bt.maxTimeout {
		return &tcpip.ErrTimeout{}
	}
	bt.t.Reset(bt.timeout)
	return nil
}

// stop stops the underlying timer.
func (bt *backoffTimer) stop() {
	bt.t.Stop()
}

// parseSynSegmentOptions parses the SYN options carried by s. When the
// timestamp option is present its TSVal and TSEcr values are also stored in
// s.parsedOptions.
func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
	synOpts := header.ParseSynOptions(s.options, s.flags.Contains(header.TCPFlagAck))
	if synOpts.TS {
		s.parsedOptions.TSVal = synOpts.TSVal
		s.parsedOptions.TSEcr = synOpts.TSEcr
	}
	return synOpts
}

// optionPool recycles the fixed-size buffers used to encode TCP options.
var optionPool = sync.Pool{
	New: func() any {
		return &[maxOptionSize]byte{}
	},
}

// getOptions returns a maxOptionSize-byte slice backed by a pooled array.
func getOptions() []byte {
	return (*optionPool.Get().(*[maxOptionSize]byte))[:]
}

// putOptions returns the array backing options to optionPool. options is
// expected to have been obtained from getOptions.
func putOptions(options []byte) {
	// Reslice to full capacity.
	optionPool.Put(optionsToArray(options))
}

// makeSynOptions encodes opts into a buffer obtained from getOptions and
// returns the encoded portion of that buffer. The MSS option is always
// encoded; the window scale option is omitted when opts.WS is negative.
//
// It panics if the encoded options do not end on the alignment expected by
// header.AddTCPOptionPadding.
func makeSynOptions(opts header.TCPSynOptions) []byte {
	// Emulate linux option order. This is as follows:
	//
	// if md5: NOP NOP MD5SIG 18 md5sig(16)
	// if mss: MSS 4 mss(2)
	// if ts and sack_advertise:
	//	SACK 2 TIMESTAMP 2 timestamp(8)
	// elif ts: NOP NOP TIMESTAMP 10 timestamp(8)
	// elif sack: NOP NOP SACK 2
	// if wscale: NOP WINDOW 3 ws(1)
	// if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8))
	//	[for each block] start_seq(4) end_seq(4)
	// if fastopen_cookie:
	//	if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2)
	// 	else: FASTOPEN (2 + len(cookie))
	//	cookie(variable) [padding to four bytes]
	//
	options := getOptions()

	// Always encode the mss.
	offset := header.EncodeMSSOption(uint32(opts.MSS), options)

	// Special ordering is required here. If both TS and SACK are enabled,
	// then the SACK option precedes TS, with no padding. If they are
	// enabled individually, then we see padding before the option.
	if opts.TS && opts.SACKPermitted {
		offset += header.EncodeSACKPermittedOption(options[offset:])
		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
	} else if opts.TS {
		offset += header.EncodeNOP(options[offset:])
		offset += header.EncodeNOP(options[offset:])
		offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:])
	} else if opts.SACKPermitted {
		offset += header.EncodeNOP(options[offset:])
		offset += header.EncodeNOP(options[offset:])
		offset += header.EncodeSACKPermittedOption(options[offset:])
	}

	// Initialize the WS option.
	if opts.WS >= 0 {
		offset += header.EncodeNOP(options[offset:])
		offset += header.EncodeWSOption(opts.WS, options[offset:])
	}

	// Padding to the end; note that this never applies unless we add a
	// fastopen option, we always expect the offset to remain the same.
	if delta := header.AddTCPOptionPadding(options, offset); delta != 0 {
		panic("unexpected option encoding")
	}

	return options[:offset]
}

// tcpFields is a struct to carry different parameters required by the
// send*TCP variant functions below.
type tcpFields struct {
	id     stack.TransportEndpointID
	ttl    uint8
	tos    uint8
	flags  header.TCPFlags
	seq    seqnum.Value
	ack    seqnum.Value
	rcvWnd seqnum.Size
	opts   []byte
	txHash uint32
}

// sendSynTCP sends a SYN or SYN-ACK described by tf with the given SYN
// options encoded into it. A failure to send is only recorded in the
// endpoint's stats; the returned error is always nil.
func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) tcpip.Error {
	tf.opts = makeSynOptions(opts)
	// We ignore SYN send errors and let the callers re-attempt send.
	p := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + len(tf.opts)})
	defer p.DecRef()
	if err := e.sendTCP(r, tf, p, stack.GSO{}); err != nil {
		e.stats.SendErrors.SynSendToNetworkFailed.Increment()
	}
	// Hand the option buffer back to the pool now that the send is done.
	putOptions(tf.opts)
	return nil
}

// This method takes ownership of pkt.
814 func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, pkt stack.PacketBufferPtr, gso stack.GSO) tcpip.Error { 815 tf.txHash = e.txHash 816 if err := sendTCP(r, tf, pkt, gso, e.owner); err != nil { 817 e.stats.SendErrors.SegmentSendToNetworkFailed.Increment() 818 return err 819 } 820 e.stats.SegmentsSent.Increment() 821 return nil 822 } 823 824 func buildTCPHdr(r *stack.Route, tf tcpFields, pkt stack.PacketBufferPtr, gso stack.GSO) { 825 optLen := len(tf.opts) 826 tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen)) 827 pkt.TransportProtocolNumber = header.TCPProtocolNumber 828 tcp.Encode(&header.TCPFields{ 829 SrcPort: tf.id.LocalPort, 830 DstPort: tf.id.RemotePort, 831 SeqNum: uint32(tf.seq), 832 AckNum: uint32(tf.ack), 833 DataOffset: uint8(header.TCPMinimumSize + optLen), 834 Flags: tf.flags, 835 WindowSize: uint16(tf.rcvWnd), 836 }) 837 copy(tcp[header.TCPMinimumSize:], tf.opts) 838 839 xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size())) 840 // Only calculate the checksum if offloading isn't supported. 841 if gso.Type != stack.GSONone && gso.NeedsCsum { 842 // This is called CHECKSUM_PARTIAL in the Linux kernel. We 843 // calculate a checksum of the pseudo-header and save it in the 844 // TCP header, then the kernel calculate a checksum of the 845 // header and data and get the right sum of the TCP packet. 
846 tcp.SetChecksum(xsum) 847 } else if r.RequiresTXTransportChecksum() { 848 xsum = checksum.Combine(xsum, pkt.Data().Checksum()) 849 tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) 850 } 851 } 852 853 func sendTCPBatch(r *stack.Route, tf tcpFields, pkt stack.PacketBufferPtr, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { 854 optLen := len(tf.opts) 855 if tf.rcvWnd > math.MaxUint16 { 856 tf.rcvWnd = math.MaxUint16 857 } 858 859 mss := int(gso.MSS) 860 n := (pkt.Data().Size() + mss - 1) / mss 861 862 size := pkt.Data().Size() 863 hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen 864 for i := 0; i < n; i++ { 865 packetSize := mss 866 if packetSize > size { 867 packetSize = size 868 } 869 size -= packetSize 870 871 pkt := pkt 872 // No need to split the packet in the final iteration. The original 873 // packet already has the truncated data. 874 shouldSplitPacket := i != n-1 875 if shouldSplitPacket { 876 splitPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: hdrSize}) 877 splitPkt.Data().ReadFromPacketData(pkt.Data(), packetSize) 878 pkt = splitPkt 879 } 880 pkt.Hash = tf.txHash 881 pkt.Owner = owner 882 883 buildTCPHdr(r, tf, pkt, gso) 884 tf.seq = tf.seq.Add(seqnum.Size(packetSize)) 885 pkt.GSOOptions = gso 886 if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil { 887 r.Stats().TCP.SegmentSendErrors.Increment() 888 if shouldSplitPacket { 889 pkt.DecRef() 890 } 891 return err 892 } 893 r.Stats().TCP.SegmentsSent.Increment() 894 if shouldSplitPacket { 895 pkt.DecRef() 896 } 897 } 898 return nil 899 } 900 901 // sendTCP sends a TCP segment with the provided options via the provided 902 // network endpoint and under the provided identity. This method takes 903 // ownership of pkt. 
904 func sendTCP(r *stack.Route, tf tcpFields, pkt stack.PacketBufferPtr, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { 905 if tf.rcvWnd > math.MaxUint16 { 906 tf.rcvWnd = math.MaxUint16 907 } 908 909 if r.Loop()&stack.PacketLoop == 0 && gso.Type == stack.GSOGvisor && int(gso.MSS) < pkt.Data().Size() { 910 return sendTCPBatch(r, tf, pkt, gso, owner) 911 } 912 913 pkt.GSOOptions = gso 914 pkt.Hash = tf.txHash 915 pkt.Owner = owner 916 buildTCPHdr(r, tf, pkt, gso) 917 918 if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil { 919 r.Stats().TCP.SegmentSendErrors.Increment() 920 return err 921 } 922 r.Stats().TCP.SegmentsSent.Increment() 923 if (tf.flags & header.TCPFlagRst) != 0 { 924 r.Stats().TCP.ResetsSent.Increment() 925 } 926 return nil 927 } 928 929 // makeOptions makes an options slice. 930 func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte { 931 options := getOptions() 932 offset := 0 933 934 // N.B. the ordering here matches the ordering used by Linux internally 935 // and described in the raw makeOptions function. We don't include 936 // unnecessary cases here (post connection.) 937 if e.SendTSOk { 938 // Embed the timestamp if timestamp has been enabled. 939 // 940 // We only use the lower 32 bits of the unix time in 941 // milliseconds. This is similar to what Linux does where it 942 // uses the lower 32 bits of the jiffies value in the tsVal 943 // field of the timestamp option. 944 // 945 // Further, RFC7323 section-5.4 recommends millisecond 946 // resolution as the lowest recommended resolution for the 947 // timestamp clock. 948 // 949 // Ref: https://tools.ietf.org/html/rfc7323#section-5.4. 
950 offset += header.EncodeNOP(options[offset:]) 951 offset += header.EncodeNOP(options[offset:]) 952 offset += header.EncodeTSOption(e.tsValNow(), e.recentTimestamp(), options[offset:]) 953 } 954 if e.SACKPermitted && len(sackBlocks) > 0 { 955 offset += header.EncodeNOP(options[offset:]) 956 offset += header.EncodeNOP(options[offset:]) 957 offset += header.EncodeSACKBlocks(sackBlocks, options[offset:]) 958 } 959 960 // We expect the above to produce an aligned offset. 961 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 962 panic("unexpected option encoding") 963 } 964 965 return options[:offset] 966 } 967 968 // sendEmptyRaw sends a TCP segment with no payload to the endpoint's peer. 969 func (e *endpoint) sendEmptyRaw(flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error { 970 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{}) 971 defer pkt.DecRef() 972 return e.sendRaw(pkt, flags, seq, ack, rcvWnd) 973 } 974 975 // sendRaw sends a TCP segment to the endpoint's peer. This method takes 976 // ownership of pkt. pkt must not have any headers set. 
977 func (e *endpoint) sendRaw(pkt stack.PacketBufferPtr, flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error { 978 var sackBlocks []header.SACKBlock 979 if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) { 980 sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] 981 } 982 options := e.makeOptions(sackBlocks) 983 defer putOptions(options) 984 pkt.ReserveHeaderBytes(header.TCPMinimumSize + int(e.route.MaxHeaderLength()) + len(options)) 985 return e.sendTCP(e.route, tcpFields{ 986 id: e.TransportEndpointInfo.ID, 987 ttl: calculateTTL(e.route, e.ipv4TTL, e.ipv6HopLimit), 988 tos: e.sendTOS, 989 flags: flags, 990 seq: seq, 991 ack: ack, 992 rcvWnd: rcvWnd, 993 opts: options, 994 }, pkt, e.gso) 995 } 996 997 // +checklocks:e.mu 998 // +checklocksalias:e.snd.ep.mu=e.mu 999 func (e *endpoint) sendData(next *segment) { 1000 // Initialize the next segment to write if it's currently nil. 1001 if e.snd.writeNext == nil { 1002 if next == nil { 1003 return 1004 } 1005 e.snd.updateWriteNext(next) 1006 } 1007 1008 // Push out any new packets. 1009 e.snd.sendData() 1010 } 1011 1012 // resetConnectionLocked puts the endpoint in an error state with the given 1013 // error code and sends a RST if and only if the error is not ErrConnectionReset 1014 // indicating that the connection is being reset due to receiving a RST. This 1015 // method must only be called from the protocol goroutine. 1016 // +checklocks:e.mu 1017 func (e *endpoint) resetConnectionLocked(err tcpip.Error) { 1018 // Only send a reset if the connection is being aborted for a reason 1019 // other than receiving a reset. 1020 e.hardError = err 1021 switch err.(type) { 1022 case *tcpip.ErrConnectionReset, *tcpip.ErrTimeout: 1023 default: 1024 // The exact sequence number to be used for the RST is the same as the 1025 // one used by Linux. 
We need to handle the case of window being shrunk 1026 // which can cause sndNxt to be outside the acceptable window on the 1027 // receiver. 1028 // 1029 // See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more 1030 // information. 1031 sndWndEnd := e.snd.SndUna.Add(e.snd.SndWnd) 1032 resetSeqNum := sndWndEnd 1033 if !sndWndEnd.LessThan(e.snd.SndNxt) || e.snd.SndNxt.Size(sndWndEnd) < (1<<e.snd.SndWndScale) { 1034 resetSeqNum = e.snd.SndNxt 1035 } 1036 e.sendEmptyRaw(header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.RcvNxt, 0) 1037 } 1038 // Don't purge read queues here. If there's buffered data, it's still allowed 1039 // to be read. 1040 e.purgeWriteQueue() 1041 e.purgePendingRcvQueue() 1042 e.cleanupLocked() 1043 e.setEndpointState(StateError) 1044 } 1045 1046 // transitionToStateCloseLocked ensures that the endpoint is 1047 // cleaned up from the transport demuxer, "before" moving to 1048 // StateClose. This will ensure that no packet will be 1049 // delivered to this endpoint from the demuxer when the endpoint 1050 // is transitioned to StateClose. 1051 // +checklocks:e.mu 1052 func (e *endpoint) transitionToStateCloseLocked() { 1053 s := e.EndpointState() 1054 if s == StateClose { 1055 return 1056 } 1057 1058 if s.connected() { 1059 e.stack.Stats().TCP.EstablishedClosed.Increment() 1060 } 1061 1062 e.cleanupLocked() 1063 // Mark the endpoint as fully closed for reads/writes. 1064 e.setEndpointState(StateClose) 1065 } 1066 1067 // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed 1068 // segment to any other endpoint other than the current one. This is called 1069 // only when the endpoint is in StateClose and we want to deliver the segment 1070 // to any other listening endpoint. We reply with RST if we cannot find one. 
1071 func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) { 1072 ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.TransportEndpointInfo.ID, s.pkt.NICID) 1073 if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != (tcpip.Address{}) { 1074 // Dual-stack socket, try IPv4. 1075 ep = e.stack.FindTransportEndpoint( 1076 header.IPv4ProtocolNumber, 1077 e.TransProto, 1078 e.TransportEndpointInfo.ID, 1079 s.pkt.NICID, 1080 ) 1081 } 1082 if ep == nil { 1083 if !s.flags.Contains(header.TCPFlagRst) { 1084 replyWithReset(e.stack, s, stack.DefaultTOS, tcpip.UseDefaultIPv4TTL, tcpip.UseDefaultIPv6HopLimit) 1085 } 1086 return 1087 } 1088 1089 if e == ep { 1090 panic(fmt.Sprintf("current endpoint not removed from demuxer, enqueing segments to itself, endpoint in state %v", e.EndpointState())) 1091 } 1092 1093 if ep := ep.(*endpoint); ep.enqueueSegment(s) { 1094 ep.notifyProcessor() 1095 } 1096 } 1097 1098 // Drain segment queue from the endpoint and try to re-match the segment to a 1099 // different endpoint. This is used when the current endpoint is transitioned to 1100 // StateClose and has been unregistered from the transport demuxer. 1101 func (e *endpoint) drainClosingSegmentQueue() { 1102 for { 1103 s := e.segmentQueue.dequeue() 1104 if s == nil { 1105 break 1106 } 1107 1108 e.tryDeliverSegmentFromClosedEndpoint(s) 1109 s.DecRef() 1110 } 1111 } 1112 1113 // +checklocks:e.mu 1114 func (e *endpoint) handleReset(s *segment) (ok bool, err tcpip.Error) { 1115 if e.rcv.acceptable(s.sequenceNumber, 0) { 1116 // RFC 793, page 37 states that "in all states 1117 // except SYN-SENT, all reset (RST) segments are 1118 // validated by checking their SEQ-fields." So 1119 // we only process it if it's acceptable. 1120 switch e.EndpointState() { 1121 // In case of a RST in CLOSE-WAIT linux moves 1122 // the socket to closed state with an error set 1123 // to indicate EPIPE. 
1124 // 1125 // Technically this seems to be at odds w/ RFC. 1126 // As per https://tools.ietf.org/html/rfc793#section-2.7 1127 // page 69 the behavior for a segment arriving 1128 // w/ RST bit set in CLOSE-WAIT is inlined below. 1129 // 1130 // ESTABLISHED 1131 // FIN-WAIT-1 1132 // FIN-WAIT-2 1133 // CLOSE-WAIT 1134 1135 // If the RST bit is set then, any outstanding RECEIVEs and 1136 // SEND should receive "reset" responses. All segment queues 1137 // should be flushed. Users should also receive an unsolicited 1138 // general "connection reset" signal. Enter the CLOSED state, 1139 // delete the TCB, and return. 1140 case StateCloseWait: 1141 e.transitionToStateCloseLocked() 1142 e.hardError = &tcpip.ErrAborted{} 1143 return false, nil 1144 default: 1145 // RFC 793, page 37 states that "in all states 1146 // except SYN-SENT, all reset (RST) segments are 1147 // validated by checking their SEQ-fields." So 1148 // we only process it if it's acceptable. 1149 1150 // Notify protocol goroutine. This is required when 1151 // handleSegment is invoked from the processor goroutine 1152 // rather than the worker goroutine. 1153 return false, &tcpip.ErrConnectionReset{} 1154 } 1155 } 1156 return true, nil 1157 } 1158 1159 // handleSegments processes all inbound segments. 
1160 // 1161 // +checklocks:e.mu 1162 // +checklocksalias:e.snd.ep.mu=e.mu 1163 func (e *endpoint) handleSegmentsLocked() tcpip.Error { 1164 sndUna := e.snd.SndUna 1165 for i := 0; i < maxSegmentsPerWake; i++ { 1166 if state := e.EndpointState(); state.closed() || state == StateTimeWait || state == StateError { 1167 return nil 1168 } 1169 s := e.segmentQueue.dequeue() 1170 if s == nil { 1171 break 1172 } 1173 cont, err := e.handleSegmentLocked(s) 1174 s.DecRef() 1175 if err != nil { 1176 return err 1177 } 1178 if !cont { 1179 return nil 1180 } 1181 } 1182 1183 // The remote ACK-ing at least 1 byte is an indication that we have a 1184 // full-duplex connection to the remote as the only way we will receive an 1185 // ACK is if the remote received data that we previously sent. 1186 // 1187 // As of writing, Linux seems to only confirm a route as reachable when 1188 // forward progress is made which is indicated by an ACK that removes data 1189 // from the retransmit queue, i.e. sender makes forward progress. 1190 if sndUna.LessThan(e.snd.SndUna) { 1191 e.route.ConfirmReachable() 1192 } 1193 1194 // Send an ACK for all processed packets if needed. 1195 if e.rcv.RcvNxt != e.snd.MaxSentAck { 1196 e.snd.sendAck() 1197 } 1198 1199 e.resetKeepaliveTimer(true /* receivedData */) 1200 1201 return nil 1202 } 1203 1204 // +checklocks:e.mu 1205 func (e *endpoint) probeSegmentLocked() { 1206 if fn := e.probe; fn != nil { 1207 var state stack.TCPEndpointState 1208 e.completeStateLocked(&state) 1209 fn(&state) 1210 } 1211 } 1212 1213 // handleSegment handles a given segment and notifies the worker goroutine if 1214 // if the connection should be terminated. 1215 // 1216 // +checklocks:e.mu 1217 // +checklocksalias:e.rcv.ep.mu=e.mu 1218 // +checklocksalias:e.snd.ep.mu=e.mu 1219 func (e *endpoint) handleSegmentLocked(s *segment) (cont bool, err tcpip.Error) { 1220 // Invoke the tcp probe if installed. 
The tcp probe function will update 1221 // the TCPEndpointState after the segment is processed. 1222 defer e.probeSegmentLocked() 1223 1224 if s.flags.Contains(header.TCPFlagRst) { 1225 if ok, err := e.handleReset(s); !ok { 1226 return false, err 1227 } 1228 } else if s.flags.Contains(header.TCPFlagSyn) { 1229 // See: https://tools.ietf.org/html/rfc5961#section-4.1 1230 // 1) If the SYN bit is set, irrespective of the sequence number, TCP 1231 // MUST send an ACK (also referred to as challenge ACK) to the remote 1232 // peer: 1233 // 1234 // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> 1235 // 1236 // After sending the acknowledgment, TCP MUST drop the unacceptable 1237 // segment and stop processing further. 1238 // 1239 // By sending an ACK, the remote peer is challenged to confirm the loss 1240 // of the previous connection and the request to start a new connection. 1241 // A legitimate peer, after restart, would not have a TCB in the 1242 // synchronized state. Thus, when the ACK arrives, the peer should send 1243 // a RST segment back with the sequence number derived from the ACK 1244 // field that caused the RST. 1245 1246 // This RST will confirm that the remote peer has indeed closed the 1247 // previous connection. Upon receipt of a valid RST, the local TCP 1248 // endpoint MUST terminate its connection. The local TCP endpoint 1249 // should then rely on SYN retransmission from the remote end to 1250 // re-establish the connection. 1251 e.snd.maybeSendOutOfWindowAck(s) 1252 } else if s.flags.Contains(header.TCPFlagAck) { 1253 // Patch the window size in the segment according to the 1254 // send window scale. 1255 s.window <<= e.snd.SndWndScale 1256 1257 // RFC 793, page 41 states that "once in the ESTABLISHED 1258 // state all segments must carry current acknowledgment 1259 // information." 
1260 drop, err := e.rcv.handleRcvdSegment(s) 1261 if err != nil { 1262 return false, err 1263 } 1264 if drop { 1265 return true, nil 1266 } 1267 1268 // Now check if the received segment has caused us to transition 1269 // to a CLOSED state, if yes then terminate processing and do 1270 // not invoke the sender. 1271 state := e.EndpointState() 1272 if state == StateClose { 1273 // When we get into StateClose while processing from the queue, 1274 // return immediately and let the protocolMainloop handle it. 1275 // 1276 // We can reach StateClose only while processing a previous segment 1277 // or a notification from the protocolMainLoop (caller goroutine). 1278 // This means that with this return, the segment dequeue below can 1279 // never occur on a closed endpoint. 1280 return false, nil 1281 } 1282 1283 e.snd.handleRcvdSegment(s) 1284 } 1285 1286 return true, nil 1287 } 1288 1289 // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP 1290 // keepalive packets periodically when the connection is idle. If we don't hear 1291 // from the other side after a number of tries, we terminate the connection. 1292 // +checklocks:e.mu 1293 // +checklocksalias:e.snd.ep.mu=e.mu 1294 func (e *endpoint) keepaliveTimerExpired() tcpip.Error { 1295 userTimeout := e.userTimeout 1296 1297 e.keepalive.Lock() 1298 if !e.SocketOptions().GetKeepAlive() || e.keepalive.timer.isZero() || !e.keepalive.timer.checkExpiration() { 1299 e.keepalive.Unlock() 1300 return nil 1301 } 1302 1303 // If a userTimeout is set then abort the connection if it is 1304 // exceeded. 
1305 if userTimeout != 0 && e.stack.Clock().NowMonotonic().Sub(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 { 1306 e.keepalive.Unlock() 1307 e.stack.Stats().TCP.EstablishedTimedout.Increment() 1308 return &tcpip.ErrTimeout{} 1309 } 1310 1311 if e.keepalive.unacked >= e.keepalive.count { 1312 e.keepalive.Unlock() 1313 e.stack.Stats().TCP.EstablishedTimedout.Increment() 1314 return &tcpip.ErrTimeout{} 1315 } 1316 1317 // RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with 1318 // seg.seq = snd.nxt-1. 1319 e.keepalive.unacked++ 1320 e.keepalive.Unlock() 1321 e.snd.sendEmptySegment(header.TCPFlagAck, e.snd.SndNxt-1) 1322 e.resetKeepaliveTimer(false) 1323 return nil 1324 } 1325 1326 // resetKeepaliveTimer restarts or stops the keepalive timer, depending on 1327 // whether it is enabled for this endpoint. 1328 func (e *endpoint) resetKeepaliveTimer(receivedData bool) { 1329 e.keepalive.Lock() 1330 defer e.keepalive.Unlock() 1331 if e.keepalive.timer.isZero() { 1332 if state := e.EndpointState(); !state.closed() { 1333 panic(fmt.Sprintf("Unexpected state when the keepalive time is cleaned up, got %s, want %s or %s", state, StateClose, StateError)) 1334 } 1335 return 1336 } 1337 if receivedData { 1338 e.keepalive.unacked = 0 1339 } 1340 // Start the keepalive timer IFF it's enabled and there is no pending 1341 // data to send. 1342 if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.SndUna != e.snd.SndNxt { 1343 e.keepalive.timer.disable() 1344 return 1345 } 1346 if e.keepalive.unacked > 0 { 1347 e.keepalive.timer.enable(e.keepalive.interval) 1348 } else { 1349 e.keepalive.timer.enable(e.keepalive.idle) 1350 } 1351 } 1352 1353 // disableKeepaliveTimer stops the keepalive timer. 1354 func (e *endpoint) disableKeepaliveTimer() { 1355 e.keepalive.Lock() 1356 e.keepalive.timer.disable() 1357 e.keepalive.Unlock() 1358 } 1359 1360 // finWait2TimerExpired is called when the FIN-WAIT-2 timeout is hit 1361 // and the peer hasn't sent us a FIN. 
1362 func (e *endpoint) finWait2TimerExpired() { 1363 e.mu.Lock() 1364 e.transitionToStateCloseLocked() 1365 e.mu.Unlock() 1366 e.drainClosingSegmentQueue() 1367 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1368 } 1369 1370 // +checklocks:e.mu 1371 func (e *endpoint) handshakeFailed(err tcpip.Error) { 1372 e.lastErrorMu.Lock() 1373 e.lastError = err 1374 e.lastErrorMu.Unlock() 1375 // handshakeFailed is also called from startHandshake when a listener 1376 // transitions out of Listen state by the time the SYN is processed. In 1377 // such cases the handshake is never initialized and the newly created 1378 // endpoint is closed right away. 1379 if e.h != nil && e.h.retransmitTimer != nil { 1380 e.h.retransmitTimer.stop() 1381 } 1382 e.hardError = err 1383 e.cleanupLocked() 1384 e.setEndpointState(StateError) 1385 } 1386 1387 // handleTimeWaitSegments processes segments received during TIME_WAIT 1388 // state. 1389 // +checklocks:e.mu 1390 // +checklocksalias:e.rcv.ep.mu=e.mu 1391 func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) { 1392 for i := 0; i < maxSegmentsPerWake; i++ { 1393 s := e.segmentQueue.dequeue() 1394 if s == nil { 1395 break 1396 } 1397 extTW, newSyn := e.rcv.handleTimeWaitSegment(s) 1398 if newSyn { 1399 info := e.TransportEndpointInfo 1400 newID := info.ID 1401 newID.RemoteAddress = tcpip.Address{} 1402 newID.RemotePort = 0 1403 netProtos := []tcpip.NetworkProtocolNumber{info.NetProto} 1404 // If the local address is an IPv4 address then also 1405 // look for IPv6 dual stack endpoints that might be 1406 // listening on the local address. 
1407 if newID.LocalAddress.To4() != (tcpip.Address{}) { 1408 netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber} 1409 } 1410 for _, netProto := range netProtos { 1411 if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, s.pkt.NICID); listenEP != nil { 1412 tcpEP := listenEP.(*endpoint) 1413 if EndpointState(tcpEP.State()) == StateListen { 1414 reuseTW = func() { 1415 if !tcpEP.enqueueSegment(s) { 1416 return 1417 } 1418 tcpEP.notifyProcessor() 1419 s.DecRef() 1420 } 1421 // We explicitly do not DecRef the segment as it's still valid and 1422 // being reflected to a listening endpoint. 1423 return false, reuseTW 1424 } 1425 } 1426 } 1427 } 1428 if extTW { 1429 extendTimeWait = true 1430 } 1431 s.DecRef() 1432 } 1433 return extendTimeWait, nil 1434 } 1435 1436 // +checklocks:e.mu 1437 func (e *endpoint) getTimeWaitDuration() time.Duration { 1438 timeWaitDuration := DefaultTCPTimeWaitTimeout 1439 1440 // Get the stack wide configuration. 1441 var tcpTW tcpip.TCPTimeWaitTimeoutOption 1442 if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil { 1443 timeWaitDuration = time.Duration(tcpTW) 1444 } 1445 return timeWaitDuration 1446 } 1447 1448 // timeWaitTimerExpired is called when an endpoint completes the required time 1449 // (typically 2 * MSL unless configured to something else at a stack level) in 1450 // TIME-WAIT state. 1451 func (e *endpoint) timeWaitTimerExpired() { 1452 e.mu.Lock() 1453 if e.EndpointState() != StateTimeWait { 1454 e.mu.Unlock() 1455 return 1456 } 1457 e.transitionToStateCloseLocked() 1458 e.mu.Unlock() 1459 e.drainClosingSegmentQueue() 1460 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1461 } 1462 1463 // notifyProcessor queues this endpoint for processing to its TCP processor. 
1464 func (e *endpoint) notifyProcessor() { 1465 // We use TryLock here to avoid deadlocks in cases where a listening endpoint that is being 1466 // closed tries to abort half completed connections which in turn try to queue any segments 1467 // queued to that endpoint back to the same listening endpoint (because it may have got 1468 // segments that matched its id but were either a RST or a new SYN which must be handled 1469 // by a listening endpoint). In such cases the Close() on the listening endpoint will handle 1470 // any queued segments after it releases the lock. 1471 if !e.mu.TryLock() { 1472 return 1473 } 1474 processor := e.protocol.dispatcher.selectProcessor(e.ID) 1475 e.mu.Unlock() 1476 processor.queueEndpoint(e) 1477 }