inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/transport/tcp/connect.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "encoding/binary" 19 "math" 20 "time" 21 22 "inet.af/netstack/sleep" 23 "inet.af/netstack/sync" 24 "inet.af/netstack/tcpip" 25 "inet.af/netstack/tcpip/buffer" 26 "inet.af/netstack/tcpip/hash/jenkins" 27 "inet.af/netstack/tcpip/header" 28 "inet.af/netstack/tcpip/seqnum" 29 "inet.af/netstack/tcpip/stack" 30 "inet.af/netstack/waiter" 31 ) 32 33 // InitialRTO is the initial retransmission timeout. 34 // https://github.com/torvalds/linux/blob/7c636d4d20f/include/net/tcp.h#L142 35 const InitialRTO = time.Second 36 37 // maxSegmentsPerWake is the maximum number of segments to process in the main 38 // protocol goroutine per wake-up. Yielding [after this number of segments are 39 // processed] allows other events to be processed as well (e.g., timeouts, 40 // resets, etc.). 41 const maxSegmentsPerWake = 100 42 43 type handshakeState int 44 45 // The following are the possible states of the TCP connection during a 3-way 46 // handshake. A depiction of the states and transitions can be found in RFC 793, 47 // page 23. 48 const ( 49 handshakeSynSent handshakeState = iota 50 handshakeSynRcvd 51 handshakeCompleted 52 ) 53 54 const ( 55 // Maximum space available for options. 56 maxOptionSize = 40 57 ) 58 59 // handshake holds the state used during a TCP 3-way handshake. 60 // 61 // NOTE: handshake.ep.mu is held during handshake processing. It is released if 62 // we are going to block and reacquired when we start processing an event. 63 type handshake struct { 64 ep *endpoint 65 listenEP *endpoint 66 state handshakeState 67 active bool 68 flags header.TCPFlags 69 ackNum seqnum.Value 70 71 // iss is the initial send sequence number, as defined in RFC 793. 72 iss seqnum.Value 73 74 // rcvWnd is the receive window, as defined in RFC 793. 75 rcvWnd seqnum.Size 76 77 // sndWnd is the send window, as defined in RFC 793. 78 sndWnd seqnum.Size 79 80 // mss is the maximum segment size received from the peer. 81 mss uint16 82 83 // sndWndScale is the send window scale, as defined in RFC 1323. A 84 // negative value means no scaling is supported by the peer. 85 sndWndScale int 86 87 // rcvWndScale is the receive window scale, as defined in RFC 1323. 88 rcvWndScale int 89 90 // startTime is the time at which the first SYN/SYN-ACK was sent. 91 startTime tcpip.MonotonicTime 92 93 // deferAccept if non-zero will drop the final ACK for a passive 94 // handshake till an ACK segment with data is received or the timeout is 95 // hit. 96 deferAccept time.Duration 97 98 // acked is true if the the final ACK for a 3-way handshake has 99 // been received. This is required to stop retransmitting the 100 // original SYN-ACK when deferAccept is enabled. 101 acked bool 102 103 // sendSYNOpts is the cached values for the SYN options to be sent. 104 sendSYNOpts header.TCPSynOptions 105 106 // sampleRTTWithTSOnly is true when the segment was retransmitted or we can't 107 // tell; then RTT can only be sampled when the incoming segment has timestamp 108 // options enabled. 109 sampleRTTWithTSOnly bool 110 } 111 112 func (e *endpoint) newHandshake() *handshake { 113 h := &handshake{ 114 ep: e, 115 active: true, 116 rcvWnd: seqnum.Size(e.initialReceiveWindow()), 117 rcvWndScale: e.rcvWndScaleForHandshake(), 118 } 119 h.resetState() 120 // Store reference to handshake state in endpoint. 121 e.h = h 122 // By the time handshake is created, e.ID is already initialized. 123 e.TSOffset = e.protocol.tsOffset(e.ID.LocalAddress, e.ID.RemoteAddress) 124 return h 125 } 126 127 func (e *endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) *handshake { 128 h := e.newHandshake() 129 h.resetToSynRcvd(isn, irs, opts, deferAccept) 130 return h 131 } 132 133 // FindWndScale determines the window scale to use for the given maximum window 134 // size. 135 func FindWndScale(wnd seqnum.Size) int { 136 if wnd < 0x10000 { 137 return 0 138 } 139 140 max := seqnum.Size(math.MaxUint16) 141 s := 0 142 for wnd > max && s < header.MaxWndScale { 143 s++ 144 max <<= 1 145 } 146 147 return s 148 } 149 150 // resetState resets the state of the handshake object such that it becomes 151 // ready for a new 3-way handshake. 152 func (h *handshake) resetState() { 153 h.state = handshakeSynSent 154 h.flags = header.TCPFlagSyn 155 h.ackNum = 0 156 h.mss = 0 157 h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.protocol.seqnumSecret) 158 } 159 160 // generateSecureISN generates a secure Initial Sequence number based on the 161 // recommendation here https://tools.ietf.org/html/rfc6528#page-3. 162 func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed uint32) seqnum.Value { 163 isnHasher := jenkins.Sum32(seed) 164 // Per hash.Hash.Writer: 165 // 166 // It never returns an error. 167 _, _ = isnHasher.Write([]byte(id.LocalAddress)) 168 _, _ = isnHasher.Write([]byte(id.RemoteAddress)) 169 portBuf := make([]byte, 2) 170 binary.LittleEndian.PutUint16(portBuf, id.LocalPort) 171 _, _ = isnHasher.Write(portBuf) 172 binary.LittleEndian.PutUint16(portBuf, id.RemotePort) 173 _, _ = isnHasher.Write(portBuf) 174 // The time period here is 64ns. This is similar to what linux uses 175 // generate a sequence number that overlaps less than one 176 // time per MSL (2 minutes). 177 // 178 // A 64ns clock ticks 10^9/64 = 15625000) times in a second. 179 // To wrap the whole 32 bit space would require 180 // 2^32/1562500 ~ 274 seconds. 181 // 182 // Which sort of guarantees that we won't reuse the ISN for a new 183 // connection for the same tuple for at least 274s. 184 isn := isnHasher.Sum32() + uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Nanoseconds()>>6) 185 return seqnum.Value(isn) 186 } 187 188 // effectiveRcvWndScale returns the effective receive window scale to be used. 189 // If the peer doesn't support window scaling, the effective rcv wnd scale is 190 // zero; otherwise it's the value calculated based on the initial rcv wnd. 191 func (h *handshake) effectiveRcvWndScale() uint8 { 192 if h.sndWndScale < 0 { 193 return 0 194 } 195 return uint8(h.rcvWndScale) 196 } 197 198 // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD 199 // state. 200 func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) { 201 h.active = false 202 h.state = handshakeSynRcvd 203 h.flags = header.TCPFlagSyn | header.TCPFlagAck 204 h.iss = iss 205 h.ackNum = irs + 1 206 h.mss = opts.MSS 207 h.sndWndScale = opts.WS 208 h.deferAccept = deferAccept 209 h.ep.setEndpointState(StateSynRecv) 210 } 211 212 // checkAck checks if the ACK number, if present, of a segment received during 213 // a TCP 3-way handshake is valid. If it's not, a RST segment is sent back in 214 // response. 215 func (h *handshake) checkAck(s *segment) bool { 216 if s.flags.Contains(header.TCPFlagAck) && s.ackNumber != h.iss+1 { 217 // RFC 793, page 36, states that a reset must be generated when 218 // the connection is in any non-synchronized state and an 219 // incoming segment acknowledges something not yet sent. The 220 // connection remains in the same state. 221 ack := s.sequenceNumber.Add(s.logicalLen()) 222 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, s.ackNumber, ack, 0) 223 return false 224 } 225 226 return true 227 } 228 229 // synSentState handles a segment received when the TCP 3-way handshake is in 230 // the SYN-SENT state. 231 func (h *handshake) synSentState(s *segment) tcpip.Error { 232 // RFC 793, page 37, states that in the SYN-SENT state, a reset is 233 // acceptable if the ack field acknowledges the SYN. 234 if s.flags.Contains(header.TCPFlagRst) { 235 if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == h.iss+1 { 236 // RFC 793, page 67, states that "If the RST bit is set [and] If the ACK 237 // was acceptable then signal the user "error: connection reset", drop 238 // the segment, enter CLOSED state, delete TCB, and return." 239 h.ep.workerCleanup = true 240 // Although the RFC above calls out ECONNRESET, Linux actually returns 241 // ECONNREFUSED here so we do as well. 242 return &tcpip.ErrConnectionRefused{} 243 } 244 return nil 245 } 246 247 if !h.checkAck(s) { 248 return nil 249 } 250 251 // We are in the SYN-SENT state. We only care about segments that have 252 // the SYN flag. 253 if !s.flags.Contains(header.TCPFlagSyn) { 254 return nil 255 } 256 257 // Parse the SYN options. 258 rcvSynOpts := parseSynSegmentOptions(s) 259 260 // Remember if the Timestamp option was negotiated. 261 h.ep.maybeEnableTimestamp(rcvSynOpts) 262 263 // Remember if the SACKPermitted option was negotiated. 264 h.ep.maybeEnableSACKPermitted(rcvSynOpts) 265 266 // Remember the sequence we'll ack from now on. 267 h.ackNum = s.sequenceNumber + 1 268 h.flags |= header.TCPFlagAck 269 h.mss = rcvSynOpts.MSS 270 h.sndWndScale = rcvSynOpts.WS 271 272 // If this is a SYN ACK response, we only need to acknowledge the SYN 273 // and the handshake is completed. 274 if s.flags.Contains(header.TCPFlagAck) { 275 h.state = handshakeCompleted 276 h.transitionToStateEstablishedLocked(s) 277 278 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale()) 279 return nil 280 } 281 282 // A SYN segment was received, but no ACK in it. We acknowledge the SYN 283 // but resend our own SYN and wait for it to be acknowledged in the 284 // SYN-RCVD state. 285 h.state = handshakeSynRcvd 286 ttl := h.ep.ttl 287 amss := h.ep.amss 288 h.ep.setEndpointState(StateSynRecv) 289 synOpts := header.TCPSynOptions{ 290 WS: int(h.effectiveRcvWndScale()), 291 TS: rcvSynOpts.TS, 292 TSVal: h.ep.tsValNow(), 293 TSEcr: h.ep.recentTimestamp(), 294 295 // We only send SACKPermitted if the other side indicated it 296 // permits SACK. This is not explicitly defined in the RFC but 297 // this is the behaviour implemented by Linux. 298 SACKPermitted: rcvSynOpts.SACKPermitted, 299 MSS: amss, 300 } 301 if ttl == 0 { 302 ttl = h.ep.route.DefaultTTL() 303 } 304 h.ep.sendSynTCP(h.ep.route, tcpFields{ 305 id: h.ep.TransportEndpointInfo.ID, 306 ttl: ttl, 307 tos: h.ep.sendTOS, 308 flags: h.flags, 309 seq: h.iss, 310 ack: h.ackNum, 311 rcvWnd: h.rcvWnd, 312 }, synOpts) 313 return nil 314 } 315 316 // synRcvdState handles a segment received when the TCP 3-way handshake is in 317 // the SYN-RCVD state. 318 func (h *handshake) synRcvdState(s *segment) tcpip.Error { 319 if s.flags.Contains(header.TCPFlagRst) { 320 // RFC 793, page 37, states that in the SYN-RCVD state, a reset 321 // is acceptable if the sequence number is in the window. 322 if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { 323 return &tcpip.ErrConnectionRefused{} 324 } 325 return nil 326 } 327 328 if !h.checkAck(s) { 329 return nil 330 } 331 332 // RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a 333 // sequence number outside of the window causes an ACK with the proper seq 334 // number and "After sending the acknowledgment, drop the unacceptable 335 // segment and return." 336 if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { 337 if h.ep.allowOutOfWindowAck() { 338 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd) 339 } 340 return nil 341 } 342 343 if s.flags.Contains(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 { 344 // We received two SYN segments with different sequence 345 // numbers, so we reset this and restart the whole 346 // process, except that we don't reset the timer. 347 ack := s.sequenceNumber.Add(s.logicalLen()) 348 seq := seqnum.Value(0) 349 if s.flags.Contains(header.TCPFlagAck) { 350 seq = s.ackNumber 351 } 352 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0) 353 354 if !h.active { 355 return &tcpip.ErrInvalidEndpointState{} 356 } 357 358 h.resetState() 359 synOpts := header.TCPSynOptions{ 360 WS: h.rcvWndScale, 361 TS: h.ep.SendTSOk, 362 TSVal: h.ep.tsValNow(), 363 TSEcr: h.ep.recentTimestamp(), 364 SACKPermitted: h.ep.SACKPermitted, 365 MSS: h.ep.amss, 366 } 367 h.ep.sendSynTCP(h.ep.route, tcpFields{ 368 id: h.ep.TransportEndpointInfo.ID, 369 ttl: h.ep.ttl, 370 tos: h.ep.sendTOS, 371 flags: h.flags, 372 seq: h.iss, 373 ack: h.ackNum, 374 rcvWnd: h.rcvWnd, 375 }, synOpts) 376 return nil 377 } 378 379 // We have previously received (and acknowledged) the peer's SYN. If the 380 // peer acknowledges our SYN, the handshake is completed. 381 if s.flags.Contains(header.TCPFlagAck) { 382 // If deferAccept is not zero and this is a bare ACK and the 383 // timeout is not hit then drop the ACK. 384 if h.deferAccept != 0 && s.data.Size() == 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) < h.deferAccept { 385 h.acked = true 386 h.ep.stack.Stats().DroppedPackets.Increment() 387 return nil 388 } 389 390 // If the timestamp option is negotiated and the segment does 391 // not carry a timestamp option then the segment must be dropped 392 // as per https://tools.ietf.org/html/rfc7323#section-3.2. 393 if h.ep.SendTSOk && !s.parsedOptions.TS { 394 h.ep.stack.Stats().DroppedPackets.Increment() 395 return nil 396 } 397 398 // Drop the ACK if the accept queue is full. 399 // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523 400 // We could abort the connection as well with a tunable as in 401 // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788 402 if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() { 403 listenEP.stack.Stats().DroppedPackets.Increment() 404 return nil 405 } 406 407 // Update timestamp if required. See RFC7323, section-4.3. 408 if h.ep.SendTSOk && s.parsedOptions.TS { 409 h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber) 410 } 411 412 h.state = handshakeCompleted 413 414 h.transitionToStateEstablishedLocked(s) 415 416 // Requeue the segment if the ACK completing the handshake has more info 417 // to be procesed by the newly established endpoint. 418 if (s.flags.Contains(header.TCPFlagFin) || s.data.Size() > 0) && h.ep.enqueueSegment(s) { 419 s.incRef() 420 h.ep.newSegmentWaker.Assert() 421 } 422 return nil 423 } 424 425 return nil 426 } 427 428 func (h *handshake) handleSegment(s *segment) tcpip.Error { 429 h.sndWnd = s.window 430 if !s.flags.Contains(header.TCPFlagSyn) && h.sndWndScale > 0 { 431 h.sndWnd <<= uint8(h.sndWndScale) 432 } 433 434 switch h.state { 435 case handshakeSynRcvd: 436 return h.synRcvdState(s) 437 case handshakeSynSent: 438 return h.synSentState(s) 439 } 440 return nil 441 } 442 443 // processSegments goes through the segment queue and processes up to 444 // maxSegmentsPerWake (if they're available). 445 func (h *handshake) processSegments() tcpip.Error { 446 for i := 0; i < maxSegmentsPerWake; i++ { 447 s := h.ep.segmentQueue.dequeue() 448 if s == nil { 449 return nil 450 } 451 452 err := h.handleSegment(s) 453 s.decRef() 454 if err != nil { 455 return err 456 } 457 458 // We stop processing packets once the handshake is completed, 459 // otherwise we may process packets meant to be processed by 460 // the main protocol goroutine. 461 if h.state == handshakeCompleted { 462 break 463 } 464 } 465 466 // If the queue is not empty, make sure we'll wake up in the next 467 // iteration. 468 if !h.ep.segmentQueue.empty() { 469 h.ep.newSegmentWaker.Assert() 470 } 471 472 return nil 473 } 474 475 // start sends the first SYN/SYN-ACK. It does not block, even if link address 476 // resolution is required. 477 func (h *handshake) start() { 478 h.startTime = h.ep.stack.Clock().NowMonotonic() 479 h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route) 480 var sackEnabled tcpip.TCPSACKEnabled 481 if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil { 482 // If stack returned an error when checking for SACKEnabled 483 // status then just default to switching off SACK negotiation. 484 sackEnabled = false 485 } 486 487 synOpts := header.TCPSynOptions{ 488 WS: h.rcvWndScale, 489 TS: true, 490 TSVal: h.ep.tsValNow(), 491 TSEcr: h.ep.recentTimestamp(), 492 SACKPermitted: bool(sackEnabled), 493 MSS: h.ep.amss, 494 } 495 496 // start() is also called in a listen context so we want to make sure we only 497 // send the TS/SACK option when we received the TS/SACK in the initial SYN. 498 if h.state == handshakeSynRcvd { 499 synOpts.TS = h.ep.SendTSOk 500 synOpts.SACKPermitted = h.ep.SACKPermitted && bool(sackEnabled) 501 if h.sndWndScale < 0 { 502 // Disable window scaling if the peer did not send us 503 // the window scaling option. 504 synOpts.WS = -1 505 } 506 } 507 508 h.sendSYNOpts = synOpts 509 h.ep.sendSynTCP(h.ep.route, tcpFields{ 510 id: h.ep.TransportEndpointInfo.ID, 511 ttl: h.ep.ttl, 512 tos: h.ep.sendTOS, 513 flags: h.flags, 514 seq: h.iss, 515 ack: h.ackNum, 516 rcvWnd: h.rcvWnd, 517 }, synOpts) 518 } 519 520 // complete completes the TCP 3-way handshake initiated by h.start(). 521 // +checklocks:h.ep.mu 522 func (h *handshake) complete() tcpip.Error { 523 // Set up the wakers. 524 var s sleep.Sleeper 525 resendWaker := sleep.Waker{} 526 s.AddWaker(&resendWaker) 527 s.AddWaker(&h.ep.notificationWaker) 528 s.AddWaker(&h.ep.newSegmentWaker) 529 defer s.Done() 530 531 // Initialize the resend timer. 532 timer, err := newBackoffTimer(h.ep.stack.Clock(), InitialRTO, MaxRTO, resendWaker.Assert) 533 if err != nil { 534 return err 535 } 536 defer timer.stop() 537 for h.state != handshakeCompleted { 538 // Unlock before blocking, and reacquire again afterwards (h.ep.mu is held 539 // throughout handshake processing). 540 h.ep.mu.Unlock() 541 w := s.Fetch(true /* block */) 542 h.ep.mu.Lock() 543 switch w { 544 case &resendWaker: 545 if err := timer.reset(); err != nil { 546 return err 547 } 548 // Resend the SYN/SYN-ACK only if the following conditions hold. 549 // - It's an active handshake (deferAccept does not apply) 550 // - It's a passive handshake and we have not yet got the final-ACK. 551 // - It's a passive handshake and we got an ACK but deferAccept is 552 // enabled and we are now past the deferAccept duration. 553 // The last is required to provide a way for the peer to complete 554 // the connection with another ACK or data (as ACKs are never 555 // retransmitted on their own). 556 if h.active || !h.acked || h.deferAccept != 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) > h.deferAccept { 557 h.ep.sendSynTCP(h.ep.route, tcpFields{ 558 id: h.ep.TransportEndpointInfo.ID, 559 ttl: h.ep.ttl, 560 tos: h.ep.sendTOS, 561 flags: h.flags, 562 seq: h.iss, 563 ack: h.ackNum, 564 rcvWnd: h.rcvWnd, 565 }, h.sendSYNOpts) 566 // If we have ever retransmitted the SYN-ACK or 567 // SYN segment, we should only measure RTT if 568 // TS option is present. 569 h.sampleRTTWithTSOnly = true 570 } 571 572 case &h.ep.notificationWaker: 573 n := h.ep.fetchNotifications() 574 if (n¬ifyClose)|(n¬ifyAbort) != 0 { 575 return &tcpip.ErrAborted{} 576 } 577 if n¬ifyShutdown != 0 { 578 return &tcpip.ErrConnectionReset{} 579 } 580 if n¬ifyDrain != 0 { 581 for !h.ep.segmentQueue.empty() { 582 s := h.ep.segmentQueue.dequeue() 583 err := h.handleSegment(s) 584 s.decRef() 585 if err != nil { 586 return err 587 } 588 if h.state == handshakeCompleted { 589 return nil 590 } 591 } 592 close(h.ep.drainDone) 593 h.ep.mu.Unlock() 594 <-h.ep.undrain 595 h.ep.mu.Lock() 596 } 597 // Check for any ICMP errors notified to us. 598 if n¬ifyError != 0 { 599 if err := h.ep.lastErrorLocked(); err != nil { 600 return err 601 } 602 // Flag the handshake failure as aborted if the lastError is 603 // cleared because of a socket layer call. 604 return &tcpip.ErrConnectionAborted{} 605 } 606 case &h.ep.newSegmentWaker: 607 if err := h.processSegments(); err != nil { 608 return err 609 } 610 } 611 } 612 613 return nil 614 } 615 616 // transitionToStateEstablisedLocked transitions the endpoint of the handshake 617 // to an established state given the last segment received from peer. It also 618 // initializes sender/receiver. 619 func (h *handshake) transitionToStateEstablishedLocked(s *segment) { 620 // Transfer handshake state to TCP connection. We disable 621 // receive window scaling if the peer doesn't support it 622 // (indicated by a negative send window scale). 623 h.ep.snd = newSender(h.ep, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) 624 625 now := h.ep.stack.Clock().NowMonotonic() 626 627 var rtt time.Duration 628 if h.ep.SendTSOk && s.parsedOptions.TSEcr != 0 { 629 rtt = h.ep.elapsed(now, s.parsedOptions.TSEcr) 630 } 631 if !h.sampleRTTWithTSOnly && rtt == 0 { 632 rtt = now.Sub(h.startTime) 633 } 634 635 if rtt > 0 { 636 h.ep.snd.updateRTO(rtt) 637 } 638 639 h.ep.rcvQueueInfo.rcvQueueMu.Lock() 640 h.ep.rcv = newReceiver(h.ep, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale()) 641 // Bootstrap the auto tuning algorithm. Starting at zero will 642 // result in a really large receive window after the first auto 643 // tuning adjustment. 644 h.ep.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd) 645 h.ep.rcvQueueInfo.rcvQueueMu.Unlock() 646 647 h.ep.setEndpointState(StateEstablished) 648 } 649 650 type backoffTimer struct { 651 timeout time.Duration 652 maxTimeout time.Duration 653 t tcpip.Timer 654 } 655 656 func newBackoffTimer(clock tcpip.Clock, timeout, maxTimeout time.Duration, f func()) (*backoffTimer, tcpip.Error) { 657 if timeout > maxTimeout { 658 return nil, &tcpip.ErrTimeout{} 659 } 660 bt := &backoffTimer{timeout: timeout, maxTimeout: maxTimeout} 661 bt.t = clock.AfterFunc(timeout, f) 662 return bt, nil 663 } 664 665 func (bt *backoffTimer) reset() tcpip.Error { 666 bt.timeout *= 2 667 if bt.timeout > bt.maxTimeout { 668 return &tcpip.ErrTimeout{} 669 } 670 bt.t.Reset(bt.timeout) 671 return nil 672 } 673 674 func (bt *backoffTimer) stop() { 675 bt.t.Stop() 676 } 677 678 func parseSynSegmentOptions(s *segment) header.TCPSynOptions { 679 synOpts := header.ParseSynOptions(s.options, s.flags.Contains(header.TCPFlagAck)) 680 if synOpts.TS { 681 s.parsedOptions.TSVal = synOpts.TSVal 682 s.parsedOptions.TSEcr = synOpts.TSEcr 683 } 684 return synOpts 685 } 686 687 var optionPool = sync.Pool{ 688 New: func() interface{} { 689 return &[maxOptionSize]byte{} 690 }, 691 } 692 693 func getOptions() []byte { 694 return (*optionPool.Get().(*[maxOptionSize]byte))[:] 695 } 696 697 func putOptions(options []byte) { 698 // Reslice to full capacity. 699 optionPool.Put(optionsToArray(options)) 700 } 701 702 func makeSynOptions(opts header.TCPSynOptions) []byte { 703 // Emulate linux option order. This is as follows: 704 // 705 // if md5: NOP NOP MD5SIG 18 md5sig(16) 706 // if mss: MSS 4 mss(2) 707 // if ts and sack_advertise: 708 // SACK 2 TIMESTAMP 2 timestamp(8) 709 // elif ts: NOP NOP TIMESTAMP 10 timestamp(8) 710 // elif sack: NOP NOP SACK 2 711 // if wscale: NOP WINDOW 3 ws(1) 712 // if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8)) 713 // [for each block] start_seq(4) end_seq(4) 714 // if fastopen_cookie: 715 // if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2) 716 // else: FASTOPEN (2 + len(cookie)) 717 // cookie(variable) [padding to four bytes] 718 // 719 options := getOptions() 720 721 // Always encode the mss. 722 offset := header.EncodeMSSOption(uint32(opts.MSS), options) 723 724 // Special ordering is required here. If both TS and SACK are enabled, 725 // then the SACK option precedes TS, with no padding. If they are 726 // enabled individually, then we see padding before the option. 727 if opts.TS && opts.SACKPermitted { 728 offset += header.EncodeSACKPermittedOption(options[offset:]) 729 offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) 730 } else if opts.TS { 731 offset += header.EncodeNOP(options[offset:]) 732 offset += header.EncodeNOP(options[offset:]) 733 offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) 734 } else if opts.SACKPermitted { 735 offset += header.EncodeNOP(options[offset:]) 736 offset += header.EncodeNOP(options[offset:]) 737 offset += header.EncodeSACKPermittedOption(options[offset:]) 738 } 739 740 // Initialize the WS option. 741 if opts.WS >= 0 { 742 offset += header.EncodeNOP(options[offset:]) 743 offset += header.EncodeWSOption(opts.WS, options[offset:]) 744 } 745 746 // Padding to the end; note that this never apply unless we add a 747 // fastopen option, we always expect the offset to remain the same. 748 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 749 panic("unexpected option encoding") 750 } 751 752 return options[:offset] 753 } 754 755 // tcpFields is a struct to carry different parameters required by the 756 // send*TCP variant functions below. 757 type tcpFields struct { 758 id stack.TransportEndpointID 759 ttl uint8 760 tos uint8 761 flags header.TCPFlags 762 seq seqnum.Value 763 ack seqnum.Value 764 rcvWnd seqnum.Size 765 opts []byte 766 txHash uint32 767 } 768 769 func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) tcpip.Error { 770 tf.opts = makeSynOptions(opts) 771 // We ignore SYN send errors and let the callers re-attempt send. 772 if err := e.sendTCP(r, tf, buffer.VectorisedView{}, stack.GSO{}); err != nil { 773 e.stats.SendErrors.SynSendToNetworkFailed.Increment() 774 } 775 putOptions(tf.opts) 776 return nil 777 } 778 779 func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO) tcpip.Error { 780 tf.txHash = e.txHash 781 if err := sendTCP(r, tf, data, gso, e.owner); err != nil { 782 e.stats.SendErrors.SegmentSendToNetworkFailed.Increment() 783 return err 784 } 785 e.stats.SegmentsSent.Increment() 786 return nil 787 } 788 789 func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) { 790 optLen := len(tf.opts) 791 tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen)) 792 pkt.TransportProtocolNumber = header.TCPProtocolNumber 793 tcp.Encode(&header.TCPFields{ 794 SrcPort: tf.id.LocalPort, 795 DstPort: tf.id.RemotePort, 796 SeqNum: uint32(tf.seq), 797 AckNum: uint32(tf.ack), 798 DataOffset: uint8(header.TCPMinimumSize + optLen), 799 Flags: tf.flags, 800 WindowSize: uint16(tf.rcvWnd), 801 }) 802 copy(tcp[header.TCPMinimumSize:], tf.opts) 803 804 xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size())) 805 // Only calculate the checksum if offloading isn't supported. 806 if gso.Type != stack.GSONone && gso.NeedsCsum { 807 // This is called CHECKSUM_PARTIAL in the Linux kernel. We 808 // calculate a checksum of the pseudo-header and save it in the 809 // TCP header, then the kernel calculate a checksum of the 810 // header and data and get the right sum of the TCP packet. 811 tcp.SetChecksum(xsum) 812 } else if r.RequiresTXTransportChecksum() { 813 xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum()) 814 tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) 815 } 816 } 817 818 func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { 819 // We need to shallow clone the VectorisedView here as ReadToView will 820 // split the VectorisedView and Trim underlying views as it splits. Not 821 // doing the clone here will cause the underlying views of data itself 822 // to be altered. 823 data = data.Clone(nil) 824 825 optLen := len(tf.opts) 826 if tf.rcvWnd > math.MaxUint16 { 827 tf.rcvWnd = math.MaxUint16 828 } 829 830 mss := int(gso.MSS) 831 n := (data.Size() + mss - 1) / mss 832 833 size := data.Size() 834 hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen 835 var pkts stack.PacketBufferList 836 for i := 0; i < n; i++ { 837 packetSize := mss 838 if packetSize > size { 839 packetSize = size 840 } 841 size -= packetSize 842 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 843 ReserveHeaderBytes: hdrSize, 844 }) 845 pkt.Hash = tf.txHash 846 pkt.Owner = owner 847 pkt.Data().ReadFromVV(&data, packetSize) 848 buildTCPHdr(r, tf, pkt, gso) 849 tf.seq = tf.seq.Add(seqnum.Size(packetSize)) 850 pkt.GSOOptions = gso 851 pkts.PushBack(pkt) 852 } 853 defer pkts.DecRef() 854 855 if tf.ttl == 0 { 856 tf.ttl = r.DefaultTTL() 857 } 858 sent, err := r.WritePackets(pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}) 859 if err != nil { 860 r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent)) 861 } 862 r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent)) 863 return err 864 } 865 866 // sendTCP sends a TCP segment with the provided options via the provided 867 // network endpoint and under the provided identity. 868 func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { 869 optLen := len(tf.opts) 870 if tf.rcvWnd > math.MaxUint16 { 871 tf.rcvWnd = math.MaxUint16 872 } 873 874 if r.Loop()&stack.PacketLoop == 0 && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() { 875 return sendTCPBatch(r, tf, data, gso, owner) 876 } 877 878 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 879 ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen, 880 Data: data, 881 }) 882 defer pkt.DecRef() 883 pkt.GSOOptions = gso 884 pkt.Hash = tf.txHash 885 pkt.Owner = owner 886 buildTCPHdr(r, tf, pkt, gso) 887 888 if tf.ttl == 0 { 889 tf.ttl = r.DefaultTTL() 890 } 891 if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil { 892 r.Stats().TCP.SegmentSendErrors.Increment() 893 return err 894 } 895 r.Stats().TCP.SegmentsSent.Increment() 896 if (tf.flags & header.TCPFlagRst) != 0 { 897 r.Stats().TCP.ResetsSent.Increment() 898 } 899 return nil 900 } 901 902 // makeOptions makes an options slice. 903 func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte { 904 options := getOptions() 905 offset := 0 906 907 // N.B. the ordering here matches the ordering used by Linux internally 908 // and described in the raw makeOptions function. We don't include 909 // unnecessary cases here (post connection.) 910 if e.SendTSOk { 911 // Embed the timestamp if timestamp has been enabled. 912 // 913 // We only use the lower 32 bits of the unix time in 914 // milliseconds. This is similar to what Linux does where it 915 // uses the lower 32 bits of the jiffies value in the tsVal 916 // field of the timestamp option. 917 // 918 // Further, RFC7323 section-5.4 recommends millisecond 919 // resolution as the lowest recommended resolution for the 920 // timestamp clock. 921 // 922 // Ref: https://tools.ietf.org/html/rfc7323#section-5.4. 923 offset += header.EncodeNOP(options[offset:]) 924 offset += header.EncodeNOP(options[offset:]) 925 offset += header.EncodeTSOption(e.tsValNow(), e.recentTimestamp(), options[offset:]) 926 } 927 if e.SACKPermitted && len(sackBlocks) > 0 { 928 offset += header.EncodeNOP(options[offset:]) 929 offset += header.EncodeNOP(options[offset:]) 930 offset += header.EncodeSACKBlocks(sackBlocks, options[offset:]) 931 } 932 933 // We expect the above to produce an aligned offset. 934 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 935 panic("unexpected option encoding") 936 } 937 938 return options[:offset] 939 } 940 941 // sendRaw sends a TCP segment to the endpoint's peer. 942 func (e *endpoint) sendRaw(data buffer.VectorisedView, flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error { 943 var sackBlocks []header.SACKBlock 944 if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) { 945 sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] 946 } 947 options := e.makeOptions(sackBlocks) 948 err := e.sendTCP(e.route, tcpFields{ 949 id: e.TransportEndpointInfo.ID, 950 ttl: e.ttl, 951 tos: e.sendTOS, 952 flags: flags, 953 seq: seq, 954 ack: ack, 955 rcvWnd: rcvWnd, 956 opts: options, 957 }, data, e.gso) 958 putOptions(options) 959 return err 960 } 961 962 // Precondition: e.mu must be locked. 963 func (e *endpoint) sendData(next *segment) { 964 // Initialize the next segment to write if it's currently nil. 965 if e.snd.writeNext == nil { 966 if next == nil { 967 return 968 } 969 e.snd.writeNext = next 970 } 971 972 // Push out any new packets. 973 e.snd.sendData() 974 } 975 976 // resetConnectionLocked puts the endpoint in an error state with the given 977 // error code and sends a RST if and only if the error is not ErrConnectionReset 978 // indicating that the connection is being reset due to receiving a RST. This 979 // method must only be called from the protocol goroutine. 980 func (e *endpoint) resetConnectionLocked(err tcpip.Error) { 981 // Only send a reset if the connection is being aborted for a reason 982 // other than receiving a reset. 983 e.setEndpointState(StateError) 984 e.hardError = err 985 switch err.(type) { 986 case *tcpip.ErrConnectionReset, *tcpip.ErrTimeout: 987 default: 988 // The exact sequence number to be used for the RST is the same as the 989 // one used by Linux. We need to handle the case of window being shrunk 990 // which can cause sndNxt to be outside the acceptable window on the 991 // receiver. 992 // 993 // See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more 994 // information. 995 sndWndEnd := e.snd.SndUna.Add(e.snd.SndWnd) 996 resetSeqNum := sndWndEnd 997 if !sndWndEnd.LessThan(e.snd.SndNxt) || e.snd.SndNxt.Size(sndWndEnd) < (1<<e.snd.SndWndScale) { 998 resetSeqNum = e.snd.SndNxt 999 } 1000 e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.RcvNxt, 0) 1001 } 1002 } 1003 1004 // completeWorkerLocked is called by the worker goroutine when it's about to 1005 // exit. 1006 func (e *endpoint) completeWorkerLocked() { 1007 // Worker is terminating(either due to moving to 1008 // CLOSED or ERROR state, ensure we release all 1009 // registrations port reservations even if the socket 1010 // itself is not yet closed by the application. 1011 e.workerRunning = false 1012 if e.workerCleanup { 1013 e.cleanupLocked() 1014 } 1015 } 1016 1017 // transitionToStateCloseLocked ensures that the endpoint is 1018 // cleaned up from the transport demuxer, "before" moving to 1019 // StateClose. This will ensure that no packet will be 1020 // delivered to this endpoint from the demuxer when the endpoint 1021 // is transitioned to StateClose. 1022 func (e *endpoint) transitionToStateCloseLocked() { 1023 s := e.EndpointState() 1024 if s == StateClose { 1025 return 1026 } 1027 1028 if s.connected() { 1029 e.stack.Stats().TCP.CurrentConnected.Decrement() 1030 e.stack.Stats().TCP.EstablishedClosed.Increment() 1031 } 1032 1033 // Mark the endpoint as fully closed for reads/writes. 1034 e.cleanupLocked() 1035 e.setEndpointState(StateClose) 1036 } 1037 1038 // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed 1039 // segment to any other endpoint other than the current one. This is called 1040 // only when the endpoint is in StateClose and we want to deliver the segment 1041 // to any other listening endpoint. We reply with RST if we cannot find one. 1042 func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) { 1043 ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.TransportEndpointInfo.ID, s.nicID) 1044 if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != "" { 1045 // Dual-stack socket, try IPv4. 1046 ep = e.stack.FindTransportEndpoint( 1047 header.IPv4ProtocolNumber, 1048 e.TransProto, 1049 e.TransportEndpointInfo.ID, 1050 s.nicID, 1051 ) 1052 } 1053 if ep == nil { 1054 replyWithReset(e.stack, s, stack.DefaultTOS, 0 /* ttl */) 1055 s.decRef() 1056 return 1057 } 1058 1059 if e == ep { 1060 panic("current endpoint not removed from demuxer, enqueing segments to itself") 1061 } 1062 1063 if ep := ep.(*endpoint); ep.enqueueSegment(s) { 1064 ep.newSegmentWaker.Assert() 1065 } 1066 } 1067 1068 // Drain segment queue from the endpoint and try to re-match the segment to a 1069 // different endpoint. This is used when the current endpoint is transitioned to 1070 // StateClose and has been unregistered from the transport demuxer. 1071 func (e *endpoint) drainClosingSegmentQueue() { 1072 for { 1073 s := e.segmentQueue.dequeue() 1074 if s == nil { 1075 break 1076 } 1077 1078 e.tryDeliverSegmentFromClosedEndpoint(s) 1079 } 1080 } 1081 1082 func (e *endpoint) handleReset(s *segment) (ok bool, err tcpip.Error) { 1083 if e.rcv.acceptable(s.sequenceNumber, 0) { 1084 // RFC 793, page 37 states that "in all states 1085 // except SYN-SENT, all reset (RST) segments are 1086 // validated by checking their SEQ-fields." So 1087 // we only process it if it's acceptable. 1088 switch e.EndpointState() { 1089 // In case of a RST in CLOSE-WAIT linux moves 1090 // the socket to closed state with an error set 1091 // to indicate EPIPE. 1092 // 1093 // Technically this seems to be at odds w/ RFC. 1094 // As per https://tools.ietf.org/html/rfc793#section-2.7 1095 // page 69 the behavior for a segment arriving 1096 // w/ RST bit set in CLOSE-WAIT is inlined below. 1097 // 1098 // ESTABLISHED 1099 // FIN-WAIT-1 1100 // FIN-WAIT-2 1101 // CLOSE-WAIT 1102 1103 // If the RST bit is set then, any outstanding RECEIVEs and 1104 // SEND should receive "reset" responses. All segment queues 1105 // should be flushed. Users should also receive an unsolicited 1106 // general "connection reset" signal. Enter the CLOSED state, 1107 // delete the TCB, and return. 1108 case StateCloseWait: 1109 e.transitionToStateCloseLocked() 1110 e.hardError = &tcpip.ErrAborted{} 1111 e.notifyProtocolGoroutine(notifyTickleWorker) 1112 return false, nil 1113 default: 1114 // RFC 793, page 37 states that "in all states 1115 // except SYN-SENT, all reset (RST) segments are 1116 // validated by checking their SEQ-fields." So 1117 // we only process it if it's acceptable. 1118 1119 // Notify protocol goroutine. This is required when 1120 // handleSegment is invoked from the processor goroutine 1121 // rather than the worker goroutine. 1122 e.notifyProtocolGoroutine(notifyResetByPeer) 1123 return false, &tcpip.ErrConnectionReset{} 1124 } 1125 } 1126 return true, nil 1127 } 1128 1129 // handleSegments processes all inbound segments. 1130 // 1131 // Precondition: e.mu must be held. 1132 func (e *endpoint) handleSegmentsLocked(fastPath bool) tcpip.Error { 1133 checkRequeue := true 1134 for i := 0; i < maxSegmentsPerWake; i++ { 1135 if state := e.EndpointState(); state.closed() || state == StateTimeWait { 1136 return nil 1137 } 1138 s := e.segmentQueue.dequeue() 1139 if s == nil { 1140 checkRequeue = false 1141 break 1142 } 1143 1144 cont, err := e.handleSegmentLocked(s) 1145 s.decRef() 1146 if err != nil { 1147 return err 1148 } 1149 if !cont { 1150 return nil 1151 } 1152 } 1153 1154 // When fastPath is true we don't want to wake up the worker 1155 // goroutine. If the endpoint has more segments to process the 1156 // dispatcher will call handleSegments again anyway. 1157 if !fastPath && checkRequeue && !e.segmentQueue.empty() { 1158 e.newSegmentWaker.Assert() 1159 } 1160 1161 // Send an ACK for all processed packets if needed. 1162 if e.rcv.RcvNxt != e.snd.MaxSentAck { 1163 e.snd.sendAck() 1164 } 1165 1166 e.resetKeepaliveTimer(true /* receivedData */) 1167 1168 return nil 1169 } 1170 1171 // Precondition: e.mu must be held. 1172 func (e *endpoint) probeSegmentLocked() { 1173 if fn := e.probe; fn != nil { 1174 fn(e.completeStateLocked()) 1175 } 1176 } 1177 1178 // handleSegment handles a given segment and notifies the worker goroutine if 1179 // if the connection should be terminated. 1180 // 1181 // Precondition: e.mu must be held. 1182 func (e *endpoint) handleSegmentLocked(s *segment) (cont bool, err tcpip.Error) { 1183 // Invoke the tcp probe if installed. The tcp probe function will update 1184 // the TCPEndpointState after the segment is processed. 1185 defer e.probeSegmentLocked() 1186 1187 if s.flags.Contains(header.TCPFlagRst) { 1188 if ok, err := e.handleReset(s); !ok { 1189 return false, err 1190 } 1191 } else if s.flags.Contains(header.TCPFlagSyn) { 1192 // See: https://tools.ietf.org/html/rfc5961#section-4.1 1193 // 1) If the SYN bit is set, irrespective of the sequence number, TCP 1194 // MUST send an ACK (also referred to as challenge ACK) to the remote 1195 // peer: 1196 // 1197 // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> 1198 // 1199 // After sending the acknowledgment, TCP MUST drop the unacceptable 1200 // segment and stop processing further. 1201 // 1202 // By sending an ACK, the remote peer is challenged to confirm the loss 1203 // of the previous connection and the request to start a new connection. 1204 // A legitimate peer, after restart, would not have a TCB in the 1205 // synchronized state. Thus, when the ACK arrives, the peer should send 1206 // a RST segment back with the sequence number derived from the ACK 1207 // field that caused the RST. 1208 1209 // This RST will confirm that the remote peer has indeed closed the 1210 // previous connection. Upon receipt of a valid RST, the local TCP 1211 // endpoint MUST terminate its connection. The local TCP endpoint 1212 // should then rely on SYN retransmission from the remote end to 1213 // re-establish the connection. 1214 e.snd.maybeSendOutOfWindowAck(s) 1215 } else if s.flags.Contains(header.TCPFlagAck) { 1216 // Patch the window size in the segment according to the 1217 // send window scale. 1218 s.window <<= e.snd.SndWndScale 1219 1220 // RFC 793, page 41 states that "once in the ESTABLISHED 1221 // state all segments must carry current acknowledgment 1222 // information." 1223 drop, err := e.rcv.handleRcvdSegment(s) 1224 if err != nil { 1225 return false, err 1226 } 1227 if drop { 1228 return true, nil 1229 } 1230 1231 // Now check if the received segment has caused us to transition 1232 // to a CLOSED state, if yes then terminate processing and do 1233 // not invoke the sender. 1234 state := e.EndpointState() 1235 if state == StateClose { 1236 // When we get into StateClose while processing from the queue, 1237 // return immediately and let the protocolMainloop handle it. 1238 // 1239 // We can reach StateClose only while processing a previous segment 1240 // or a notification from the protocolMainLoop (caller goroutine). 1241 // This means that with this return, the segment dequeue below can 1242 // never occur on a closed endpoint. 1243 return false, nil 1244 } 1245 1246 e.snd.handleRcvdSegment(s) 1247 } 1248 1249 return true, nil 1250 } 1251 1252 // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP 1253 // keepalive packets periodically when the connection is idle. If we don't hear 1254 // from the other side after a number of tries, we terminate the connection. 1255 func (e *endpoint) keepaliveTimerExpired() tcpip.Error { 1256 userTimeout := e.userTimeout 1257 1258 e.keepalive.Lock() 1259 if !e.SocketOptions().GetKeepAlive() || !e.keepalive.timer.checkExpiration() { 1260 e.keepalive.Unlock() 1261 return nil 1262 } 1263 1264 // If a userTimeout is set then abort the connection if it is 1265 // exceeded. 1266 if userTimeout != 0 && e.stack.Clock().NowMonotonic().Sub(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 { 1267 e.keepalive.Unlock() 1268 e.stack.Stats().TCP.EstablishedTimedout.Increment() 1269 return &tcpip.ErrTimeout{} 1270 } 1271 1272 if e.keepalive.unacked >= e.keepalive.count { 1273 e.keepalive.Unlock() 1274 e.stack.Stats().TCP.EstablishedTimedout.Increment() 1275 return &tcpip.ErrTimeout{} 1276 } 1277 1278 // RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with 1279 // seg.seq = snd.nxt-1. 1280 e.keepalive.unacked++ 1281 e.keepalive.Unlock() 1282 e.snd.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, e.snd.SndNxt-1) 1283 e.resetKeepaliveTimer(false) 1284 return nil 1285 } 1286 1287 // resetKeepaliveTimer restarts or stops the keepalive timer, depending on 1288 // whether it is enabled for this endpoint. 1289 func (e *endpoint) resetKeepaliveTimer(receivedData bool) { 1290 e.keepalive.Lock() 1291 if receivedData { 1292 e.keepalive.unacked = 0 1293 } 1294 // Start the keepalive timer IFF it's enabled and there is no pending 1295 // data to send. 1296 if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.SndUna != e.snd.SndNxt { 1297 e.keepalive.timer.disable() 1298 e.keepalive.Unlock() 1299 return 1300 } 1301 if e.keepalive.unacked > 0 { 1302 e.keepalive.timer.enable(e.keepalive.interval) 1303 } else { 1304 e.keepalive.timer.enable(e.keepalive.idle) 1305 } 1306 e.keepalive.Unlock() 1307 } 1308 1309 // disableKeepaliveTimer stops the keepalive timer. 1310 func (e *endpoint) disableKeepaliveTimer() { 1311 e.keepalive.Lock() 1312 e.keepalive.timer.disable() 1313 e.keepalive.Unlock() 1314 } 1315 1316 // protocolMainLoopDone is called at the end of protocolMainLoop. 1317 // +checklocksrelease:e.mu 1318 func (e *endpoint) protocolMainLoopDone(closeTimer tcpip.Timer) { 1319 if e.snd != nil { 1320 e.snd.resendTimer.cleanup() 1321 e.snd.probeTimer.cleanup() 1322 e.snd.reorderTimer.cleanup() 1323 } 1324 1325 if closeTimer != nil { 1326 closeTimer.Stop() 1327 } 1328 1329 e.completeWorkerLocked() 1330 1331 if e.drainDone != nil { 1332 close(e.drainDone) 1333 } 1334 1335 e.mu.Unlock() 1336 1337 e.drainClosingSegmentQueue() 1338 1339 // When the protocol loop exits we should wake up our waiters. 1340 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1341 } 1342 1343 // handleWakeup handles a wakeup event while connected. 1344 // 1345 // +checklocks:e.mu 1346 func (e *endpoint) handleWakeup(w, closeWaker *sleep.Waker, closeTimer *tcpip.Timer) tcpip.Error { 1347 switch w { 1348 case &e.sndQueueInfo.sndWaker: 1349 e.sendData(nil /* next */) 1350 case &e.newSegmentWaker: 1351 return e.handleSegmentsLocked(false /* fastPath */) 1352 case &e.snd.resendWaker: 1353 if !e.snd.retransmitTimerExpired() { 1354 e.stack.Stats().TCP.EstablishedTimedout.Increment() 1355 return &tcpip.ErrTimeout{} 1356 } 1357 case closeWaker: 1358 // This means the socket is being closed due to the 1359 // TCP-FIN-WAIT2 timeout was hit. Just mark the socket as 1360 // closed. 1361 e.transitionToStateCloseLocked() 1362 e.workerCleanup = true 1363 case &e.snd.probeWaker: 1364 return e.snd.probeTimerExpired() 1365 case &e.keepalive.waker: 1366 return e.keepaliveTimerExpired() 1367 case &e.notificationWaker: 1368 n := e.fetchNotifications() 1369 if n¬ifyNonZeroReceiveWindow != 0 { 1370 e.rcv.nonZeroWindow() 1371 } 1372 1373 if n¬ifyMTUChanged != 0 { 1374 e.sndQueueInfo.sndQueueMu.Lock() 1375 count := e.sndQueueInfo.PacketTooBigCount 1376 e.sndQueueInfo.PacketTooBigCount = 0 1377 mtu := e.sndQueueInfo.SndMTU 1378 e.sndQueueInfo.sndQueueMu.Unlock() 1379 1380 e.snd.updateMaxPayloadSize(mtu, count) 1381 } 1382 1383 if n¬ifyReset != 0 || n¬ifyAbort != 0 { 1384 return &tcpip.ErrConnectionAborted{} 1385 } 1386 1387 if n¬ifyResetByPeer != 0 { 1388 return &tcpip.ErrConnectionReset{} 1389 } 1390 1391 if n¬ifyClose != 0 && e.closed { 1392 switch e.EndpointState() { 1393 case StateEstablished: 1394 // Perform full shutdown if the endpoint is 1395 // still established. This can occur when 1396 // notifyClose was asserted just before 1397 // becoming established. 1398 e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) 1399 case StateFinWait2: 1400 // The socket has been closed and we are in 1401 // FIN_WAIT2 so start the FIN_WAIT2 timer. 1402 if *closeTimer == nil { 1403 *closeTimer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, closeWaker.Assert) 1404 } 1405 } 1406 } 1407 1408 if n¬ifyKeepaliveChanged != 0 { 1409 // The timer could fire in background when the endpoint 1410 // is drained. That's OK. See above. 1411 e.resetKeepaliveTimer(true) 1412 } 1413 1414 if n¬ifyDrain != 0 { 1415 for !e.segmentQueue.empty() { 1416 if err := e.handleSegmentsLocked(false /* fastPath */); err != nil { 1417 return err 1418 } 1419 } 1420 if !e.EndpointState().closed() { 1421 // Only block the worker if the endpoint 1422 // is not in closed state or error state. 1423 close(e.drainDone) 1424 e.mu.Unlock() 1425 <-e.undrain 1426 e.mu.Lock() 1427 } 1428 } 1429 1430 // N.B. notifyTickleWorker may be set, but there is no action 1431 // to take in this case. 1432 case &e.snd.reorderWaker: 1433 return e.snd.rc.reorderTimerExpired() 1434 default: 1435 panic("unknown waker") // Shouldn't happen. 1436 } 1437 return nil 1438 } 1439 1440 // protocolMainLoop is the main loop of the TCP protocol. It runs in its own 1441 // goroutine and is responsible for sending segments and handling received 1442 // segments. 1443 func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) { 1444 var ( 1445 closeTimer tcpip.Timer 1446 closeWaker sleep.Waker 1447 ) 1448 1449 e.mu.Lock() 1450 if handshake { 1451 if err := e.h.complete(); err != nil { // +checklocksforce 1452 e.lastErrorMu.Lock() 1453 e.lastError = err 1454 e.lastErrorMu.Unlock() 1455 1456 e.setEndpointState(StateError) 1457 e.hardError = err 1458 1459 e.workerCleanup = true 1460 e.protocolMainLoopDone(closeTimer) 1461 return 1462 } 1463 } 1464 1465 // Reaching this point means that we successfully completed the 3-way 1466 // handshake with our peer. The current endpoint state could be any state 1467 // post ESTABLISHED, including CLOSED or ERROR if the endpoint processes a 1468 // RST from the peer via the dispatcher fast path, before the loop is 1469 // started. 1470 if s := e.EndpointState(); !s.connected() { 1471 switch s { 1472 case StateClose, StateError: 1473 // If the endpoint is in CLOSED/ERROR state, sender state has to be 1474 // initialized if the endpoint was previously established. 1475 if e.snd != nil { 1476 break 1477 } 1478 fallthrough 1479 default: 1480 panic("endpoint was not established, current state " + s.String()) 1481 } 1482 } 1483 1484 // Completing the 3-way handshake is an indication that the route is valid 1485 // and the remote is reachable as the only way we can complete a handshake 1486 // is if our SYN reached the remote and their ACK reached us. 1487 e.route.ConfirmReachable() 1488 1489 drained := e.drainDone != nil 1490 if drained { 1491 close(e.drainDone) 1492 e.mu.Unlock() 1493 <-e.undrain 1494 e.mu.Lock() 1495 } 1496 1497 // Add all wakers. 1498 var s sleep.Sleeper 1499 s.AddWaker(&e.sndQueueInfo.sndWaker) 1500 s.AddWaker(&e.newSegmentWaker) 1501 s.AddWaker(&e.snd.resendWaker) 1502 s.AddWaker(&e.snd.probeWaker) 1503 s.AddWaker(&closeWaker) 1504 s.AddWaker(&e.keepalive.waker) 1505 s.AddWaker(&e.notificationWaker) 1506 s.AddWaker(&e.snd.reorderWaker) 1507 1508 // Notify the caller that the waker initialization is complete and the 1509 // endpoint is ready. 1510 if wakerInitDone != nil { 1511 close(wakerInitDone) 1512 } 1513 1514 // Tell waiters that the endpoint is connected and writable. 1515 e.waiterQueue.Notify(waiter.WritableEvents) 1516 1517 // The following assertions and notifications are needed for restored 1518 // endpoints. Fresh newly created endpoints have empty states and should 1519 // not invoke any. 1520 if !e.segmentQueue.empty() { 1521 e.newSegmentWaker.Assert() 1522 } 1523 1524 e.rcvQueueInfo.rcvQueueMu.Lock() 1525 if !e.rcvQueueInfo.rcvQueue.Empty() { 1526 e.waiterQueue.Notify(waiter.ReadableEvents) 1527 } 1528 e.rcvQueueInfo.rcvQueueMu.Unlock() 1529 1530 if e.workerCleanup { 1531 e.notifyProtocolGoroutine(notifyClose) 1532 } 1533 1534 // Main loop. Handle segments until both send and receive ends of the 1535 // connection have completed. 1536 cleanupOnError := func(err tcpip.Error) { 1537 e.stack.Stats().TCP.CurrentConnected.Decrement() 1538 e.workerCleanup = true 1539 if err != nil { 1540 e.resetConnectionLocked(err) 1541 } 1542 } 1543 1544 loop: 1545 for { 1546 switch e.EndpointState() { 1547 case StateTimeWait, StateClose, StateError: 1548 break loop 1549 } 1550 1551 e.mu.Unlock() 1552 w := s.Fetch(true /* block */) 1553 e.mu.Lock() 1554 1555 // We need to double check here because the notification may be 1556 // stale by the time we got around to processing it. 1557 switch e.EndpointState() { 1558 case StateError: 1559 // If the endpoint has already transitioned to an ERROR 1560 // state just pass nil here as any reset that may need 1561 // to be sent etc should already have been done and we 1562 // just want to terminate the loop and cleanup the 1563 // endpoint. 1564 cleanupOnError(nil) 1565 e.protocolMainLoopDone(closeTimer) 1566 return 1567 case StateTimeWait: 1568 fallthrough 1569 case StateClose: 1570 break loop 1571 default: 1572 if err := e.handleWakeup(w, &closeWaker, &closeTimer); err != nil { 1573 cleanupOnError(err) 1574 e.protocolMainLoopDone(closeTimer) 1575 return 1576 } 1577 } 1578 } 1579 1580 var reuseTW func() 1581 if e.EndpointState() == StateTimeWait { 1582 // Disable close timer as we now entering real TIME_WAIT. 1583 if closeTimer != nil { 1584 closeTimer.Stop() 1585 } 1586 // Mark the current sleeper done so as to free all associated 1587 // wakers. 1588 s.Done() 1589 // Wake up any waiters before we enter TIME_WAIT. 1590 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1591 e.workerCleanup = true 1592 reuseTW = e.doTimeWait() 1593 } 1594 1595 // Handle any StateError transition from StateTimeWait. 1596 if e.EndpointState() == StateError { 1597 cleanupOnError(nil) 1598 e.protocolMainLoopDone(closeTimer) 1599 return 1600 } 1601 1602 e.transitionToStateCloseLocked() 1603 1604 e.protocolMainLoopDone(closeTimer) 1605 1606 // A new SYN was received during TIME_WAIT and we need to abort 1607 // the timewait and redirect the segment to the listener queue 1608 if reuseTW != nil { 1609 reuseTW() 1610 } 1611 } 1612 1613 // handleTimeWaitSegments processes segments received during TIME_WAIT 1614 // state. 1615 func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) { 1616 checkRequeue := true 1617 for i := 0; i < maxSegmentsPerWake; i++ { 1618 s := e.segmentQueue.dequeue() 1619 if s == nil { 1620 checkRequeue = false 1621 break 1622 } 1623 extTW, newSyn := e.rcv.handleTimeWaitSegment(s) 1624 if newSyn { 1625 info := e.TransportEndpointInfo 1626 newID := info.ID 1627 newID.RemoteAddress = "" 1628 newID.RemotePort = 0 1629 netProtos := []tcpip.NetworkProtocolNumber{info.NetProto} 1630 // If the local address is an IPv4 address then also 1631 // look for IPv6 dual stack endpoints that might be 1632 // listening on the local address. 1633 if newID.LocalAddress.To4() != "" { 1634 netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber} 1635 } 1636 for _, netProto := range netProtos { 1637 if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, s.nicID); listenEP != nil { 1638 tcpEP := listenEP.(*endpoint) 1639 if EndpointState(tcpEP.State()) == StateListen { 1640 reuseTW = func() { 1641 if !tcpEP.enqueueSegment(s) { 1642 s.decRef() 1643 return 1644 } 1645 tcpEP.newSegmentWaker.Assert() 1646 } 1647 // We explicitly do not decRef 1648 // the segment as it's still 1649 // valid and being reflected to 1650 // a listening endpoint. 1651 return false, reuseTW 1652 } 1653 } 1654 } 1655 } 1656 if extTW { 1657 extendTimeWait = true 1658 } 1659 s.decRef() 1660 } 1661 if checkRequeue && !e.segmentQueue.empty() { 1662 e.newSegmentWaker.Assert() 1663 } 1664 return extendTimeWait, nil 1665 } 1666 1667 // doTimeWait is responsible for handling the TCP behaviour once a socket 1668 // enters the TIME_WAIT state. Optionally it can return a closure that 1669 // should be executed after releasing the endpoint registrations. This is 1670 // done in cases where a new SYN is received during TIME_WAIT that carries 1671 // a sequence number larger than one see on the connection. 1672 // +checklocks:e.mu 1673 func (e *endpoint) doTimeWait() (twReuse func()) { 1674 // Trigger a 2 * MSL time wait state. During this period 1675 // we will drop all incoming segments. 1676 // NOTE: On Linux this is not configurable and is fixed at 60 seconds. 1677 timeWaitDuration := DefaultTCPTimeWaitTimeout 1678 1679 // Get the stack wide configuration. 1680 var tcpTW tcpip.TCPTimeWaitTimeoutOption 1681 if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil { 1682 timeWaitDuration = time.Duration(tcpTW) 1683 } 1684 1685 var s sleep.Sleeper 1686 defer s.Done() 1687 s.AddWaker(&e.newSegmentWaker) 1688 s.AddWaker(&e.notificationWaker) 1689 1690 var timeWaitWaker sleep.Waker 1691 s.AddWaker(&timeWaitWaker) 1692 timeWaitTimer := e.stack.Clock().AfterFunc(timeWaitDuration, timeWaitWaker.Assert) 1693 defer timeWaitTimer.Stop() 1694 1695 for { 1696 e.mu.Unlock() 1697 w := s.Fetch(true /* block */) 1698 e.mu.Lock() 1699 switch w { 1700 case &e.newSegmentWaker: 1701 extendTimeWait, reuseTW := e.handleTimeWaitSegments() 1702 if reuseTW != nil { 1703 return reuseTW 1704 } 1705 if extendTimeWait { 1706 timeWaitTimer.Reset(timeWaitDuration) 1707 } 1708 case &e.notificationWaker: 1709 n := e.fetchNotifications() 1710 if n¬ifyAbort != 0 { 1711 return nil 1712 } 1713 if n¬ifyDrain != 0 { 1714 for !e.segmentQueue.empty() { 1715 // Ignore extending TIME_WAIT during a 1716 // save. For sockets in TIME_WAIT we just 1717 // terminate the TIME_WAIT early. 1718 e.handleTimeWaitSegments() 1719 } 1720 close(e.drainDone) 1721 e.mu.Unlock() 1722 <-e.undrain 1723 e.mu.Lock() 1724 return nil 1725 } 1726 case &timeWaitWaker: 1727 return nil 1728 } 1729 } 1730 }