github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/tcp/connect.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "encoding/binary" 19 "math" 20 "time" 21 22 "github.com/SagerNet/gvisor/pkg/sleep" 23 "github.com/SagerNet/gvisor/pkg/sync" 24 "github.com/SagerNet/gvisor/pkg/tcpip" 25 "github.com/SagerNet/gvisor/pkg/tcpip/buffer" 26 "github.com/SagerNet/gvisor/pkg/tcpip/hash/jenkins" 27 "github.com/SagerNet/gvisor/pkg/tcpip/header" 28 "github.com/SagerNet/gvisor/pkg/tcpip/seqnum" 29 "github.com/SagerNet/gvisor/pkg/tcpip/stack" 30 "github.com/SagerNet/gvisor/pkg/waiter" 31 ) 32 33 // maxSegmentsPerWake is the maximum number of segments to process in the main 34 // protocol goroutine per wake-up. Yielding [after this number of segments are 35 // processed] allows other events to be processed as well (e.g., timeouts, 36 // resets, etc.). 37 const maxSegmentsPerWake = 100 38 39 type handshakeState int 40 41 // The following are the possible states of the TCP connection during a 3-way 42 // handshake. A depiction of the states and transitions can be found in RFC 793, 43 // page 23. 44 const ( 45 handshakeSynSent handshakeState = iota 46 handshakeSynRcvd 47 handshakeCompleted 48 ) 49 50 // The following are used to set up sleepers. 51 const ( 52 wakerForNotification = iota 53 wakerForNewSegment 54 wakerForResend 55 ) 56 57 const ( 58 // Maximum space available for options. 59 maxOptionSize = 40 60 ) 61 62 // handshake holds the state used during a TCP 3-way handshake. 63 // 64 // NOTE: handshake.ep.mu is held during handshake processing. It is released if 65 // we are going to block and reacquired when we start processing an event. 66 type handshake struct { 67 ep *endpoint 68 listenEP *endpoint 69 state handshakeState 70 active bool 71 flags header.TCPFlags 72 ackNum seqnum.Value 73 74 // iss is the initial send sequence number, as defined in RFC 793. 75 iss seqnum.Value 76 77 // rcvWnd is the receive window, as defined in RFC 793. 78 rcvWnd seqnum.Size 79 80 // sndWnd is the send window, as defined in RFC 793. 81 sndWnd seqnum.Size 82 83 // mss is the maximum segment size received from the peer. 84 mss uint16 85 86 // sndWndScale is the send window scale, as defined in RFC 1323. A 87 // negative value means no scaling is supported by the peer. 88 sndWndScale int 89 90 // rcvWndScale is the receive window scale, as defined in RFC 1323. 91 rcvWndScale int 92 93 // startTime is the time at which the first SYN/SYN-ACK was sent. 94 startTime tcpip.MonotonicTime 95 96 // deferAccept if non-zero will drop the final ACK for a passive 97 // handshake till an ACK segment with data is received or the timeout is 98 // hit. 99 deferAccept time.Duration 100 101 // acked is true if the the final ACK for a 3-way handshake has 102 // been received. This is required to stop retransmitting the 103 // original SYN-ACK when deferAccept is enabled. 104 acked bool 105 106 // sendSYNOpts is the cached values for the SYN options to be sent. 107 sendSYNOpts header.TCPSynOptions 108 } 109 110 func (e *endpoint) newHandshake() *handshake { 111 h := &handshake{ 112 ep: e, 113 active: true, 114 rcvWnd: seqnum.Size(e.initialReceiveWindow()), 115 rcvWndScale: e.rcvWndScaleForHandshake(), 116 } 117 h.resetState() 118 // Store reference to handshake state in endpoint. 119 e.h = h 120 return h 121 } 122 123 func (e *endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) *handshake { 124 h := e.newHandshake() 125 h.resetToSynRcvd(isn, irs, opts, deferAccept) 126 return h 127 } 128 129 // FindWndScale determines the window scale to use for the given maximum window 130 // size. 131 func FindWndScale(wnd seqnum.Size) int { 132 if wnd < 0x10000 { 133 return 0 134 } 135 136 max := seqnum.Size(math.MaxUint16) 137 s := 0 138 for wnd > max && s < header.MaxWndScale { 139 s++ 140 max <<= 1 141 } 142 143 return s 144 } 145 146 // resetState resets the state of the handshake object such that it becomes 147 // ready for a new 3-way handshake. 148 func (h *handshake) resetState() { 149 h.state = handshakeSynSent 150 h.flags = header.TCPFlagSyn 151 h.ackNum = 0 152 h.mss = 0 153 h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.stack.Seed()) 154 } 155 156 // generateSecureISN generates a secure Initial Sequence number based on the 157 // recommendation here https://tools.ietf.org/html/rfc6528#page-3. 158 func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed uint32) seqnum.Value { 159 isnHasher := jenkins.Sum32(seed) 160 isnHasher.Write([]byte(id.LocalAddress)) 161 isnHasher.Write([]byte(id.RemoteAddress)) 162 portBuf := make([]byte, 2) 163 binary.LittleEndian.PutUint16(portBuf, id.LocalPort) 164 isnHasher.Write(portBuf) 165 binary.LittleEndian.PutUint16(portBuf, id.RemotePort) 166 isnHasher.Write(portBuf) 167 // The time period here is 64ns. This is similar to what linux uses 168 // generate a sequence number that overlaps less than one 169 // time per MSL (2 minutes). 170 // 171 // A 64ns clock ticks 10^9/64 = 15625000) times in a second. 172 // To wrap the whole 32 bit space would require 173 // 2^32/1562500 ~ 274 seconds. 174 // 175 // Which sort of guarantees that we won't reuse the ISN for a new 176 // connection for the same tuple for at least 274s. 177 isn := isnHasher.Sum32() + uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Nanoseconds()>>6) 178 return seqnum.Value(isn) 179 } 180 181 // effectiveRcvWndScale returns the effective receive window scale to be used. 182 // If the peer doesn't support window scaling, the effective rcv wnd scale is 183 // zero; otherwise it's the value calculated based on the initial rcv wnd. 184 func (h *handshake) effectiveRcvWndScale() uint8 { 185 if h.sndWndScale < 0 { 186 return 0 187 } 188 return uint8(h.rcvWndScale) 189 } 190 191 // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD 192 // state. 193 func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) { 194 h.active = false 195 h.state = handshakeSynRcvd 196 h.flags = header.TCPFlagSyn | header.TCPFlagAck 197 h.iss = iss 198 h.ackNum = irs + 1 199 h.mss = opts.MSS 200 h.sndWndScale = opts.WS 201 h.deferAccept = deferAccept 202 h.ep.setEndpointState(StateSynRecv) 203 } 204 205 // checkAck checks if the ACK number, if present, of a segment received during 206 // a TCP 3-way handshake is valid. If it's not, a RST segment is sent back in 207 // response. 208 func (h *handshake) checkAck(s *segment) bool { 209 if s.flags.Contains(header.TCPFlagAck) && s.ackNumber != h.iss+1 { 210 // RFC 793, page 36, states that a reset must be generated when 211 // the connection is in any non-synchronized state and an 212 // incoming segment acknowledges something not yet sent. The 213 // connection remains in the same state. 214 ack := s.sequenceNumber.Add(s.logicalLen()) 215 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, s.ackNumber, ack, 0) 216 return false 217 } 218 219 return true 220 } 221 222 // synSentState handles a segment received when the TCP 3-way handshake is in 223 // the SYN-SENT state. 224 func (h *handshake) synSentState(s *segment) tcpip.Error { 225 // RFC 793, page 37, states that in the SYN-SENT state, a reset is 226 // acceptable if the ack field acknowledges the SYN. 227 if s.flags.Contains(header.TCPFlagRst) { 228 if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == h.iss+1 { 229 // RFC 793, page 67, states that "If the RST bit is set [and] If the ACK 230 // was acceptable then signal the user "error: connection reset", drop 231 // the segment, enter CLOSED state, delete TCB, and return." 232 h.ep.workerCleanup = true 233 // Although the RFC above calls out ECONNRESET, Linux actually returns 234 // ECONNREFUSED here so we do as well. 235 return &tcpip.ErrConnectionRefused{} 236 } 237 return nil 238 } 239 240 if !h.checkAck(s) { 241 return nil 242 } 243 244 // We are in the SYN-SENT state. We only care about segments that have 245 // the SYN flag. 246 if !s.flags.Contains(header.TCPFlagSyn) { 247 return nil 248 } 249 250 // Parse the SYN options. 251 rcvSynOpts := parseSynSegmentOptions(s) 252 253 // Remember if the Timestamp option was negotiated. 254 h.ep.maybeEnableTimestamp(&rcvSynOpts) 255 256 // Remember if the SACKPermitted option was negotiated. 257 h.ep.maybeEnableSACKPermitted(&rcvSynOpts) 258 259 // Remember the sequence we'll ack from now on. 260 h.ackNum = s.sequenceNumber + 1 261 h.flags |= header.TCPFlagAck 262 h.mss = rcvSynOpts.MSS 263 h.sndWndScale = rcvSynOpts.WS 264 265 // If this is a SYN ACK response, we only need to acknowledge the SYN 266 // and the handshake is completed. 267 if s.flags.Contains(header.TCPFlagAck) { 268 h.state = handshakeCompleted 269 270 h.ep.transitionToStateEstablishedLocked(h) 271 272 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale()) 273 return nil 274 } 275 276 // A SYN segment was received, but no ACK in it. We acknowledge the SYN 277 // but resend our own SYN and wait for it to be acknowledged in the 278 // SYN-RCVD state. 279 h.state = handshakeSynRcvd 280 ttl := h.ep.ttl 281 amss := h.ep.amss 282 h.ep.setEndpointState(StateSynRecv) 283 synOpts := header.TCPSynOptions{ 284 WS: int(h.effectiveRcvWndScale()), 285 TS: rcvSynOpts.TS, 286 TSVal: h.ep.timestamp(), 287 TSEcr: h.ep.recentTimestamp(), 288 289 // We only send SACKPermitted if the other side indicated it 290 // permits SACK. This is not explicitly defined in the RFC but 291 // this is the behaviour implemented by Linux. 292 SACKPermitted: rcvSynOpts.SACKPermitted, 293 MSS: amss, 294 } 295 if ttl == 0 { 296 ttl = h.ep.route.DefaultTTL() 297 } 298 h.ep.sendSynTCP(h.ep.route, tcpFields{ 299 id: h.ep.TransportEndpointInfo.ID, 300 ttl: ttl, 301 tos: h.ep.sendTOS, 302 flags: h.flags, 303 seq: h.iss, 304 ack: h.ackNum, 305 rcvWnd: h.rcvWnd, 306 }, synOpts) 307 return nil 308 } 309 310 // synRcvdState handles a segment received when the TCP 3-way handshake is in 311 // the SYN-RCVD state. 312 func (h *handshake) synRcvdState(s *segment) tcpip.Error { 313 if s.flags.Contains(header.TCPFlagRst) { 314 // RFC 793, page 37, states that in the SYN-RCVD state, a reset 315 // is acceptable if the sequence number is in the window. 316 if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { 317 return &tcpip.ErrConnectionRefused{} 318 } 319 return nil 320 } 321 322 if !h.checkAck(s) { 323 return nil 324 } 325 326 // RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a 327 // sequence number outside of the window causes an ACK with the proper seq 328 // number and "After sending the acknowledgment, drop the unacceptable 329 // segment and return." 330 if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { 331 if h.ep.allowOutOfWindowAck() { 332 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd) 333 } 334 return nil 335 } 336 337 if s.flags.Contains(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 { 338 // We received two SYN segments with different sequence 339 // numbers, so we reset this and restart the whole 340 // process, except that we don't reset the timer. 341 ack := s.sequenceNumber.Add(s.logicalLen()) 342 seq := seqnum.Value(0) 343 if s.flags.Contains(header.TCPFlagAck) { 344 seq = s.ackNumber 345 } 346 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0) 347 348 if !h.active { 349 return &tcpip.ErrInvalidEndpointState{} 350 } 351 352 h.resetState() 353 synOpts := header.TCPSynOptions{ 354 WS: h.rcvWndScale, 355 TS: h.ep.SendTSOk, 356 TSVal: h.ep.timestamp(), 357 TSEcr: h.ep.recentTimestamp(), 358 SACKPermitted: h.ep.SACKPermitted, 359 MSS: h.ep.amss, 360 } 361 h.ep.sendSynTCP(h.ep.route, tcpFields{ 362 id: h.ep.TransportEndpointInfo.ID, 363 ttl: h.ep.ttl, 364 tos: h.ep.sendTOS, 365 flags: h.flags, 366 seq: h.iss, 367 ack: h.ackNum, 368 rcvWnd: h.rcvWnd, 369 }, synOpts) 370 return nil 371 } 372 373 // We have previously received (and acknowledged) the peer's SYN. If the 374 // peer acknowledges our SYN, the handshake is completed. 375 if s.flags.Contains(header.TCPFlagAck) { 376 // If deferAccept is not zero and this is a bare ACK and the 377 // timeout is not hit then drop the ACK. 378 if h.deferAccept != 0 && s.data.Size() == 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) < h.deferAccept { 379 h.acked = true 380 h.ep.stack.Stats().DroppedPackets.Increment() 381 return nil 382 } 383 384 // If the timestamp option is negotiated and the segment does 385 // not carry a timestamp option then the segment must be dropped 386 // as per https://tools.ietf.org/html/rfc7323#section-3.2. 387 if h.ep.SendTSOk && !s.parsedOptions.TS { 388 h.ep.stack.Stats().DroppedPackets.Increment() 389 return nil 390 } 391 392 // Drop the ACK if the accept queue is full. 393 // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523 394 // We could abort the connection as well with a tunable as in 395 // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788 396 if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() { 397 listenEP.stack.Stats().DroppedPackets.Increment() 398 return nil 399 } 400 401 // Update timestamp if required. See RFC7323, section-4.3. 402 if h.ep.SendTSOk && s.parsedOptions.TS { 403 h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber) 404 } 405 h.state = handshakeCompleted 406 407 h.ep.transitionToStateEstablishedLocked(h) 408 409 // Requeue the segment if the ACK completing the handshake has more info 410 // to be procesed by the newly established endpoint. 411 if (s.flags.Contains(header.TCPFlagFin) || s.data.Size() > 0) && h.ep.enqueueSegment(s) { 412 s.incRef() 413 h.ep.newSegmentWaker.Assert() 414 } 415 return nil 416 } 417 418 return nil 419 } 420 421 func (h *handshake) handleSegment(s *segment) tcpip.Error { 422 h.sndWnd = s.window 423 if !s.flags.Contains(header.TCPFlagSyn) && h.sndWndScale > 0 { 424 h.sndWnd <<= uint8(h.sndWndScale) 425 } 426 427 switch h.state { 428 case handshakeSynRcvd: 429 return h.synRcvdState(s) 430 case handshakeSynSent: 431 return h.synSentState(s) 432 } 433 return nil 434 } 435 436 // processSegments goes through the segment queue and processes up to 437 // maxSegmentsPerWake (if they're available). 438 func (h *handshake) processSegments() tcpip.Error { 439 for i := 0; i < maxSegmentsPerWake; i++ { 440 s := h.ep.segmentQueue.dequeue() 441 if s == nil { 442 return nil 443 } 444 445 err := h.handleSegment(s) 446 s.decRef() 447 if err != nil { 448 return err 449 } 450 451 // We stop processing packets once the handshake is completed, 452 // otherwise we may process packets meant to be processed by 453 // the main protocol goroutine. 454 if h.state == handshakeCompleted { 455 break 456 } 457 } 458 459 // If the queue is not empty, make sure we'll wake up in the next 460 // iteration. 461 if !h.ep.segmentQueue.empty() { 462 h.ep.newSegmentWaker.Assert() 463 } 464 465 return nil 466 } 467 468 // start sends the first SYN/SYN-ACK. It does not block, even if link address 469 // resolution is required. 470 func (h *handshake) start() { 471 h.startTime = h.ep.stack.Clock().NowMonotonic() 472 h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route) 473 var sackEnabled tcpip.TCPSACKEnabled 474 if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil { 475 // If stack returned an error when checking for SACKEnabled 476 // status then just default to switching off SACK negotiation. 477 sackEnabled = false 478 } 479 480 synOpts := header.TCPSynOptions{ 481 WS: h.rcvWndScale, 482 TS: true, 483 TSVal: h.ep.timestamp(), 484 TSEcr: h.ep.recentTimestamp(), 485 SACKPermitted: bool(sackEnabled), 486 MSS: h.ep.amss, 487 } 488 489 // start() is also called in a listen context so we want to make sure we only 490 // send the TS/SACK option when we received the TS/SACK in the initial SYN. 491 if h.state == handshakeSynRcvd { 492 synOpts.TS = h.ep.SendTSOk 493 synOpts.SACKPermitted = h.ep.SACKPermitted && bool(sackEnabled) 494 if h.sndWndScale < 0 { 495 // Disable window scaling if the peer did not send us 496 // the window scaling option. 497 synOpts.WS = -1 498 } 499 } 500 501 h.sendSYNOpts = synOpts 502 h.ep.sendSynTCP(h.ep.route, tcpFields{ 503 id: h.ep.TransportEndpointInfo.ID, 504 ttl: h.ep.ttl, 505 tos: h.ep.sendTOS, 506 flags: h.flags, 507 seq: h.iss, 508 ack: h.ackNum, 509 rcvWnd: h.rcvWnd, 510 }, synOpts) 511 } 512 513 // complete completes the TCP 3-way handshake initiated by h.start(). 514 // +checklocks:h.ep.mu 515 func (h *handshake) complete() tcpip.Error { 516 // Set up the wakers. 517 var s sleep.Sleeper 518 resendWaker := sleep.Waker{} 519 s.AddWaker(&resendWaker, wakerForResend) 520 s.AddWaker(&h.ep.notificationWaker, wakerForNotification) 521 s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment) 522 defer s.Done() 523 524 // Initialize the resend timer. 525 timer, err := newBackoffTimer(h.ep.stack.Clock(), time.Second, MaxRTO, resendWaker.Assert) 526 if err != nil { 527 return err 528 } 529 defer timer.stop() 530 for h.state != handshakeCompleted { 531 // Unlock before blocking, and reacquire again afterwards (h.ep.mu is held 532 // throughout handshake processing). 533 h.ep.mu.Unlock() 534 index, _ := s.Fetch(true /* block */) 535 h.ep.mu.Lock() 536 switch index { 537 538 case wakerForResend: 539 if err := timer.reset(); err != nil { 540 return err 541 } 542 // Resend the SYN/SYN-ACK only if the following conditions hold. 543 // - It's an active handshake (deferAccept does not apply) 544 // - It's a passive handshake and we have not yet got the final-ACK. 545 // - It's a passive handshake and we got an ACK but deferAccept is 546 // enabled and we are now past the deferAccept duration. 547 // The last is required to provide a way for the peer to complete 548 // the connection with another ACK or data (as ACKs are never 549 // retransmitted on their own). 550 if h.active || !h.acked || h.deferAccept != 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) > h.deferAccept { 551 h.ep.sendSynTCP(h.ep.route, tcpFields{ 552 id: h.ep.TransportEndpointInfo.ID, 553 ttl: h.ep.ttl, 554 tos: h.ep.sendTOS, 555 flags: h.flags, 556 seq: h.iss, 557 ack: h.ackNum, 558 rcvWnd: h.rcvWnd, 559 }, h.sendSYNOpts) 560 } 561 562 case wakerForNotification: 563 n := h.ep.fetchNotifications() 564 if (n¬ifyClose)|(n¬ifyAbort) != 0 { 565 return &tcpip.ErrAborted{} 566 } 567 if n¬ifyDrain != 0 { 568 for !h.ep.segmentQueue.empty() { 569 s := h.ep.segmentQueue.dequeue() 570 err := h.handleSegment(s) 571 s.decRef() 572 if err != nil { 573 return err 574 } 575 if h.state == handshakeCompleted { 576 return nil 577 } 578 } 579 close(h.ep.drainDone) 580 h.ep.mu.Unlock() 581 <-h.ep.undrain 582 h.ep.mu.Lock() 583 } 584 // Check for any ICMP errors notified to us. 585 if n¬ifyError != 0 { 586 if err := h.ep.lastErrorLocked(); err != nil { 587 return err 588 } 589 // Flag the handshake failure as aborted if the lastError is 590 // cleared because of a socket layer call. 591 return &tcpip.ErrConnectionAborted{} 592 } 593 case wakerForNewSegment: 594 if err := h.processSegments(); err != nil { 595 return err 596 } 597 } 598 } 599 600 return nil 601 } 602 603 type backoffTimer struct { 604 timeout time.Duration 605 maxTimeout time.Duration 606 t tcpip.Timer 607 } 608 609 func newBackoffTimer(clock tcpip.Clock, timeout, maxTimeout time.Duration, f func()) (*backoffTimer, tcpip.Error) { 610 if timeout > maxTimeout { 611 return nil, &tcpip.ErrTimeout{} 612 } 613 bt := &backoffTimer{timeout: timeout, maxTimeout: maxTimeout} 614 bt.t = clock.AfterFunc(timeout, f) 615 return bt, nil 616 } 617 618 func (bt *backoffTimer) reset() tcpip.Error { 619 bt.timeout *= 2 620 if bt.timeout > bt.maxTimeout { 621 return &tcpip.ErrTimeout{} 622 } 623 bt.t.Reset(bt.timeout) 624 return nil 625 } 626 627 func (bt *backoffTimer) stop() { 628 bt.t.Stop() 629 } 630 631 func parseSynSegmentOptions(s *segment) header.TCPSynOptions { 632 synOpts := header.ParseSynOptions(s.options, s.flags.Contains(header.TCPFlagAck)) 633 if synOpts.TS { 634 s.parsedOptions.TSVal = synOpts.TSVal 635 s.parsedOptions.TSEcr = synOpts.TSEcr 636 } 637 return synOpts 638 } 639 640 var optionPool = sync.Pool{ 641 New: func() interface{} { 642 return &[maxOptionSize]byte{} 643 }, 644 } 645 646 func getOptions() []byte { 647 return (*optionPool.Get().(*[maxOptionSize]byte))[:] 648 } 649 650 func putOptions(options []byte) { 651 // Reslice to full capacity. 652 optionPool.Put(optionsToArray(options)) 653 } 654 655 func makeSynOptions(opts header.TCPSynOptions) []byte { 656 // Emulate linux option order. This is as follows: 657 // 658 // if md5: NOP NOP MD5SIG 18 md5sig(16) 659 // if mss: MSS 4 mss(2) 660 // if ts and sack_advertise: 661 // SACK 2 TIMESTAMP 2 timestamp(8) 662 // elif ts: NOP NOP TIMESTAMP 10 timestamp(8) 663 // elif sack: NOP NOP SACK 2 664 // if wscale: NOP WINDOW 3 ws(1) 665 // if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8)) 666 // [for each block] start_seq(4) end_seq(4) 667 // if fastopen_cookie: 668 // if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2) 669 // else: FASTOPEN (2 + len(cookie)) 670 // cookie(variable) [padding to four bytes] 671 // 672 options := getOptions() 673 674 // Always encode the mss. 675 offset := header.EncodeMSSOption(uint32(opts.MSS), options) 676 677 // Special ordering is required here. If both TS and SACK are enabled, 678 // then the SACK option precedes TS, with no padding. If they are 679 // enabled individually, then we see padding before the option. 680 if opts.TS && opts.SACKPermitted { 681 offset += header.EncodeSACKPermittedOption(options[offset:]) 682 offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) 683 } else if opts.TS { 684 offset += header.EncodeNOP(options[offset:]) 685 offset += header.EncodeNOP(options[offset:]) 686 offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) 687 } else if opts.SACKPermitted { 688 offset += header.EncodeNOP(options[offset:]) 689 offset += header.EncodeNOP(options[offset:]) 690 offset += header.EncodeSACKPermittedOption(options[offset:]) 691 } 692 693 // Initialize the WS option. 694 if opts.WS >= 0 { 695 offset += header.EncodeNOP(options[offset:]) 696 offset += header.EncodeWSOption(opts.WS, options[offset:]) 697 } 698 699 // Padding to the end; note that this never apply unless we add a 700 // fastopen option, we always expect the offset to remain the same. 701 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 702 panic("unexpected option encoding") 703 } 704 705 return options[:offset] 706 } 707 708 // tcpFields is a struct to carry different parameters required by the 709 // send*TCP variant functions below. 710 type tcpFields struct { 711 id stack.TransportEndpointID 712 ttl uint8 713 tos uint8 714 flags header.TCPFlags 715 seq seqnum.Value 716 ack seqnum.Value 717 rcvWnd seqnum.Size 718 opts []byte 719 txHash uint32 720 } 721 722 func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) tcpip.Error { 723 tf.opts = makeSynOptions(opts) 724 // We ignore SYN send errors and let the callers re-attempt send. 725 if err := e.sendTCP(r, tf, buffer.VectorisedView{}, stack.GSO{}); err != nil { 726 e.stats.SendErrors.SynSendToNetworkFailed.Increment() 727 } 728 putOptions(tf.opts) 729 return nil 730 } 731 732 func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO) tcpip.Error { 733 tf.txHash = e.txHash 734 if err := sendTCP(r, tf, data, gso, e.owner); err != nil { 735 e.stats.SendErrors.SegmentSendToNetworkFailed.Increment() 736 return err 737 } 738 e.stats.SegmentsSent.Increment() 739 return nil 740 } 741 742 func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) { 743 optLen := len(tf.opts) 744 tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen)) 745 pkt.TransportProtocolNumber = header.TCPProtocolNumber 746 tcp.Encode(&header.TCPFields{ 747 SrcPort: tf.id.LocalPort, 748 DstPort: tf.id.RemotePort, 749 SeqNum: uint32(tf.seq), 750 AckNum: uint32(tf.ack), 751 DataOffset: uint8(header.TCPMinimumSize + optLen), 752 Flags: tf.flags, 753 WindowSize: uint16(tf.rcvWnd), 754 }) 755 copy(tcp[header.TCPMinimumSize:], tf.opts) 756 757 xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size())) 758 // Only calculate the checksum if offloading isn't supported. 759 if gso.Type != stack.GSONone && gso.NeedsCsum { 760 // This is called CHECKSUM_PARTIAL in the Linux kernel. We 761 // calculate a checksum of the pseudo-header and save it in the 762 // TCP header, then the kernel calculate a checksum of the 763 // header and data and get the right sum of the TCP packet. 764 tcp.SetChecksum(xsum) 765 } else if r.RequiresTXTransportChecksum() { 766 xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum()) 767 tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) 768 } 769 } 770 771 func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { 772 // We need to shallow clone the VectorisedView here as ReadToView will 773 // split the VectorisedView and Trim underlying views as it splits. Not 774 // doing the clone here will cause the underlying views of data itself 775 // to be altered. 776 data = data.Clone(nil) 777 778 optLen := len(tf.opts) 779 if tf.rcvWnd > math.MaxUint16 { 780 tf.rcvWnd = math.MaxUint16 781 } 782 783 mss := int(gso.MSS) 784 n := (data.Size() + mss - 1) / mss 785 786 size := data.Size() 787 hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen 788 var pkts stack.PacketBufferList 789 for i := 0; i < n; i++ { 790 packetSize := mss 791 if packetSize > size { 792 packetSize = size 793 } 794 size -= packetSize 795 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 796 ReserveHeaderBytes: hdrSize, 797 }) 798 pkt.Hash = tf.txHash 799 pkt.Owner = owner 800 pkt.Data().ReadFromVV(&data, packetSize) 801 buildTCPHdr(r, tf, pkt, gso) 802 tf.seq = tf.seq.Add(seqnum.Size(packetSize)) 803 pkt.GSOOptions = gso 804 pkts.PushBack(pkt) 805 } 806 807 if tf.ttl == 0 { 808 tf.ttl = r.DefaultTTL() 809 } 810 sent, err := r.WritePackets(pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}) 811 if err != nil { 812 r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent)) 813 } 814 r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent)) 815 return err 816 } 817 818 // sendTCP sends a TCP segment with the provided options via the provided 819 // network endpoint and under the provided identity. 820 func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { 821 optLen := len(tf.opts) 822 if tf.rcvWnd > math.MaxUint16 { 823 tf.rcvWnd = math.MaxUint16 824 } 825 826 if r.Loop()&stack.PacketLoop == 0 && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() { 827 return sendTCPBatch(r, tf, data, gso, owner) 828 } 829 830 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 831 ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen, 832 Data: data, 833 }) 834 pkt.GSOOptions = gso 835 pkt.Hash = tf.txHash 836 pkt.Owner = owner 837 buildTCPHdr(r, tf, pkt, gso) 838 839 if tf.ttl == 0 { 840 tf.ttl = r.DefaultTTL() 841 } 842 if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil { 843 r.Stats().TCP.SegmentSendErrors.Increment() 844 return err 845 } 846 r.Stats().TCP.SegmentsSent.Increment() 847 if (tf.flags & header.TCPFlagRst) != 0 { 848 r.Stats().TCP.ResetsSent.Increment() 849 } 850 return nil 851 } 852 853 // makeOptions makes an options slice. 854 func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte { 855 options := getOptions() 856 offset := 0 857 858 // N.B. the ordering here matches the ordering used by Linux internally 859 // and described in the raw makeOptions function. We don't include 860 // unnecessary cases here (post connection.) 861 if e.SendTSOk { 862 // Embed the timestamp if timestamp has been enabled. 863 // 864 // We only use the lower 32 bits of the unix time in 865 // milliseconds. This is similar to what Linux does where it 866 // uses the lower 32 bits of the jiffies value in the tsVal 867 // field of the timestamp option. 868 // 869 // Further, RFC7323 section-5.4 recommends millisecond 870 // resolution as the lowest recommended resolution for the 871 // timestamp clock. 872 // 873 // Ref: https://tools.ietf.org/html/rfc7323#section-5.4. 874 offset += header.EncodeNOP(options[offset:]) 875 offset += header.EncodeNOP(options[offset:]) 876 offset += header.EncodeTSOption(e.timestamp(), e.recentTimestamp(), options[offset:]) 877 } 878 if e.SACKPermitted && len(sackBlocks) > 0 { 879 offset += header.EncodeNOP(options[offset:]) 880 offset += header.EncodeNOP(options[offset:]) 881 offset += header.EncodeSACKBlocks(sackBlocks, options[offset:]) 882 } 883 884 // We expect the above to produce an aligned offset. 885 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 886 panic("unexpected option encoding") 887 } 888 889 return options[:offset] 890 } 891 892 // sendRaw sends a TCP segment to the endpoint's peer. 893 func (e *endpoint) sendRaw(data buffer.VectorisedView, flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error { 894 var sackBlocks []header.SACKBlock 895 if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) { 896 sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] 897 } 898 options := e.makeOptions(sackBlocks) 899 err := e.sendTCP(e.route, tcpFields{ 900 id: e.TransportEndpointInfo.ID, 901 ttl: e.ttl, 902 tos: e.sendTOS, 903 flags: flags, 904 seq: seq, 905 ack: ack, 906 rcvWnd: rcvWnd, 907 opts: options, 908 }, data, e.gso) 909 putOptions(options) 910 return err 911 } 912 913 // Precondition: e.mu must be locked. 914 func (e *endpoint) sendData(next *segment) { 915 // Initialize the next segment to write if it's currently nil. 916 if e.snd.writeNext == nil { 917 if next == nil { 918 return 919 } 920 e.snd.writeNext = next 921 } 922 923 // Push out any new packets. 924 e.snd.sendData() 925 } 926 927 // resetConnectionLocked puts the endpoint in an error state with the given 928 // error code and sends a RST if and only if the error is not ErrConnectionReset 929 // indicating that the connection is being reset due to receiving a RST. This 930 // method must only be called from the protocol goroutine. 931 func (e *endpoint) resetConnectionLocked(err tcpip.Error) { 932 // Only send a reset if the connection is being aborted for a reason 933 // other than receiving a reset. 934 e.setEndpointState(StateError) 935 e.hardError = err 936 switch err.(type) { 937 case *tcpip.ErrConnectionReset, *tcpip.ErrTimeout: 938 default: 939 // The exact sequence number to be used for the RST is the same as the 940 // one used by Linux. We need to handle the case of window being shrunk 941 // which can cause sndNxt to be outside the acceptable window on the 942 // receiver. 943 // 944 // See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more 945 // information. 946 sndWndEnd := e.snd.SndUna.Add(e.snd.SndWnd) 947 resetSeqNum := sndWndEnd 948 if !sndWndEnd.LessThan(e.snd.SndNxt) || e.snd.SndNxt.Size(sndWndEnd) < (1<<e.snd.SndWndScale) { 949 resetSeqNum = e.snd.SndNxt 950 } 951 e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.RcvNxt, 0) 952 } 953 } 954 955 // completeWorkerLocked is called by the worker goroutine when it's about to 956 // exit. 957 func (e *endpoint) completeWorkerLocked() { 958 // Worker is terminating(either due to moving to 959 // CLOSED or ERROR state, ensure we release all 960 // registrations port reservations even if the socket 961 // itself is not yet closed by the application. 962 e.workerRunning = false 963 if e.workerCleanup { 964 e.cleanupLocked() 965 } 966 } 967 968 // transitionToStateEstablisedLocked transitions a given endpoint 969 // to an established state using the handshake parameters provided. 970 // It also initializes sender/receiver. 971 func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) { 972 // Transfer handshake state to TCP connection. We disable 973 // receive window scaling if the peer doesn't support it 974 // (indicated by a negative send window scale). 975 e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) 976 977 e.rcvQueueInfo.rcvQueueMu.Lock() 978 e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale()) 979 // Bootstrap the auto tuning algorithm. Starting at zero will 980 // result in a really large receive window after the first auto 981 // tuning adjustment. 982 e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd) 983 e.rcvQueueInfo.rcvQueueMu.Unlock() 984 985 e.setEndpointState(StateEstablished) 986 } 987 988 // transitionToStateCloseLocked ensures that the endpoint is 989 // cleaned up from the transport demuxer, "before" moving to 990 // StateClose. This will ensure that no packet will be 991 // delivered to this endpoint from the demuxer when the endpoint 992 // is transitioned to StateClose. 993 func (e *endpoint) transitionToStateCloseLocked() { 994 s := e.EndpointState() 995 if s == StateClose { 996 return 997 } 998 999 if s.connected() { 1000 e.stack.Stats().TCP.CurrentConnected.Decrement() 1001 e.stack.Stats().TCP.EstablishedClosed.Increment() 1002 } 1003 1004 // Mark the endpoint as fully closed for reads/writes. 1005 e.cleanupLocked() 1006 e.setEndpointState(StateClose) 1007 } 1008 1009 // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed 1010 // segment to any other endpoint other than the current one. This is called 1011 // only when the endpoint is in StateClose and we want to deliver the segment 1012 // to any other listening endpoint. We reply with RST if we cannot find one. 1013 func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) { 1014 ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.TransportEndpointInfo.ID, s.nicID) 1015 if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != "" { 1016 // Dual-stack socket, try IPv4. 1017 ep = e.stack.FindTransportEndpoint( 1018 header.IPv4ProtocolNumber, 1019 e.TransProto, 1020 e.TransportEndpointInfo.ID, 1021 s.nicID, 1022 ) 1023 } 1024 if ep == nil { 1025 replyWithReset(e.stack, s, stack.DefaultTOS, 0 /* ttl */) 1026 s.decRef() 1027 return 1028 } 1029 1030 if e == ep { 1031 panic("current endpoint not removed from demuxer, enqueing segments to itself") 1032 } 1033 1034 if ep := ep.(*endpoint); ep.enqueueSegment(s) { 1035 ep.newSegmentWaker.Assert() 1036 } 1037 } 1038 1039 // Drain segment queue from the endpoint and try to re-match the segment to a 1040 // different endpoint. This is used when the current endpoint is transitioned to 1041 // StateClose and has been unregistered from the transport demuxer. 1042 func (e *endpoint) drainClosingSegmentQueue() { 1043 for { 1044 s := e.segmentQueue.dequeue() 1045 if s == nil { 1046 break 1047 } 1048 1049 e.tryDeliverSegmentFromClosedEndpoint(s) 1050 } 1051 } 1052 1053 func (e *endpoint) handleReset(s *segment) (ok bool, err tcpip.Error) { 1054 if e.rcv.acceptable(s.sequenceNumber, 0) { 1055 // RFC 793, page 37 states that "in all states 1056 // except SYN-SENT, all reset (RST) segments are 1057 // validated by checking their SEQ-fields." So 1058 // we only process it if it's acceptable. 1059 switch e.EndpointState() { 1060 // In case of a RST in CLOSE-WAIT linux moves 1061 // the socket to closed state with an error set 1062 // to indicate EPIPE. 1063 // 1064 // Technically this seems to be at odds w/ RFC. 1065 // As per https://tools.ietf.org/html/rfc793#section-2.7 1066 // page 69 the behavior for a segment arriving 1067 // w/ RST bit set in CLOSE-WAIT is inlined below. 1068 // 1069 // ESTABLISHED 1070 // FIN-WAIT-1 1071 // FIN-WAIT-2 1072 // CLOSE-WAIT 1073 1074 // If the RST bit is set then, any outstanding RECEIVEs and 1075 // SEND should receive "reset" responses. All segment queues 1076 // should be flushed. Users should also receive an unsolicited 1077 // general "connection reset" signal. Enter the CLOSED state, 1078 // delete the TCB, and return. 1079 case StateCloseWait: 1080 e.transitionToStateCloseLocked() 1081 e.hardError = &tcpip.ErrAborted{} 1082 e.notifyProtocolGoroutine(notifyTickleWorker) 1083 return false, nil 1084 default: 1085 // RFC 793, page 37 states that "in all states 1086 // except SYN-SENT, all reset (RST) segments are 1087 // validated by checking their SEQ-fields." So 1088 // we only process it if it's acceptable. 1089 1090 // Notify protocol goroutine. This is required when 1091 // handleSegment is invoked from the processor goroutine 1092 // rather than the worker goroutine. 1093 e.notifyProtocolGoroutine(notifyResetByPeer) 1094 return false, &tcpip.ErrConnectionReset{} 1095 } 1096 } 1097 return true, nil 1098 } 1099 1100 // handleSegments processes all inbound segments. 1101 // 1102 // Precondition: e.mu must be held. 1103 func (e *endpoint) handleSegmentsLocked(fastPath bool) tcpip.Error { 1104 checkRequeue := true 1105 for i := 0; i < maxSegmentsPerWake; i++ { 1106 if state := e.EndpointState(); state.closed() || state == StateTimeWait { 1107 return nil 1108 } 1109 s := e.segmentQueue.dequeue() 1110 if s == nil { 1111 checkRequeue = false 1112 break 1113 } 1114 1115 cont, err := e.handleSegmentLocked(s) 1116 s.decRef() 1117 if err != nil { 1118 return err 1119 } 1120 if !cont { 1121 return nil 1122 } 1123 } 1124 1125 // When fastPath is true we don't want to wake up the worker 1126 // goroutine. If the endpoint has more segments to process the 1127 // dispatcher will call handleSegments again anyway. 1128 if !fastPath && checkRequeue && !e.segmentQueue.empty() { 1129 e.newSegmentWaker.Assert() 1130 } 1131 1132 // Send an ACK for all processed packets if needed. 1133 if e.rcv.RcvNxt != e.snd.MaxSentAck { 1134 e.snd.sendAck() 1135 } 1136 1137 e.resetKeepaliveTimer(true /* receivedData */) 1138 1139 return nil 1140 } 1141 1142 // Precondition: e.mu must be held. 1143 func (e *endpoint) probeSegmentLocked() { 1144 if fn := e.probe; fn != nil { 1145 fn(e.completeStateLocked()) 1146 } 1147 } 1148 1149 // handleSegment handles a given segment and notifies the worker goroutine if 1150 // if the connection should be terminated. 1151 // 1152 // Precondition: e.mu must be held. 1153 func (e *endpoint) handleSegmentLocked(s *segment) (cont bool, err tcpip.Error) { 1154 // Invoke the tcp probe if installed. The tcp probe function will update 1155 // the TCPEndpointState after the segment is processed. 1156 defer e.probeSegmentLocked() 1157 1158 if s.flags.Contains(header.TCPFlagRst) { 1159 if ok, err := e.handleReset(s); !ok { 1160 return false, err 1161 } 1162 } else if s.flags.Contains(header.TCPFlagSyn) { 1163 // See: https://tools.ietf.org/html/rfc5961#section-4.1 1164 // 1) If the SYN bit is set, irrespective of the sequence number, TCP 1165 // MUST send an ACK (also referred to as challenge ACK) to the remote 1166 // peer: 1167 // 1168 // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> 1169 // 1170 // After sending the acknowledgment, TCP MUST drop the unacceptable 1171 // segment and stop processing further. 1172 // 1173 // By sending an ACK, the remote peer is challenged to confirm the loss 1174 // of the previous connection and the request to start a new connection. 1175 // A legitimate peer, after restart, would not have a TCB in the 1176 // synchronized state. Thus, when the ACK arrives, the peer should send 1177 // a RST segment back with the sequence number derived from the ACK 1178 // field that caused the RST. 1179 1180 // This RST will confirm that the remote peer has indeed closed the 1181 // previous connection. Upon receipt of a valid RST, the local TCP 1182 // endpoint MUST terminate its connection. The local TCP endpoint 1183 // should then rely on SYN retransmission from the remote end to 1184 // re-establish the connection. 1185 e.snd.maybeSendOutOfWindowAck(s) 1186 } else if s.flags.Contains(header.TCPFlagAck) { 1187 // Patch the window size in the segment according to the 1188 // send window scale. 1189 s.window <<= e.snd.SndWndScale 1190 1191 // RFC 793, page 41 states that "once in the ESTABLISHED 1192 // state all segments must carry current acknowledgment 1193 // information." 1194 drop, err := e.rcv.handleRcvdSegment(s) 1195 if err != nil { 1196 return false, err 1197 } 1198 if drop { 1199 return true, nil 1200 } 1201 1202 // Now check if the received segment has caused us to transition 1203 // to a CLOSED state, if yes then terminate processing and do 1204 // not invoke the sender. 1205 state := e.EndpointState() 1206 if state == StateClose { 1207 // When we get into StateClose while processing from the queue, 1208 // return immediately and let the protocolMainloop handle it. 1209 // 1210 // We can reach StateClose only while processing a previous segment 1211 // or a notification from the protocolMainLoop (caller goroutine). 1212 // This means that with this return, the segment dequeue below can 1213 // never occur on a closed endpoint. 1214 return false, nil 1215 } 1216 1217 e.snd.handleRcvdSegment(s) 1218 } 1219 1220 return true, nil 1221 } 1222 1223 // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP 1224 // keepalive packets periodically when the connection is idle. If we don't hear 1225 // from the other side after a number of tries, we terminate the connection. 1226 func (e *endpoint) keepaliveTimerExpired() tcpip.Error { 1227 userTimeout := e.userTimeout 1228 1229 e.keepalive.Lock() 1230 if !e.SocketOptions().GetKeepAlive() || !e.keepalive.timer.checkExpiration() { 1231 e.keepalive.Unlock() 1232 return nil 1233 } 1234 1235 // If a userTimeout is set then abort the connection if it is 1236 // exceeded. 1237 if userTimeout != 0 && e.stack.Clock().NowMonotonic().Sub(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 { 1238 e.keepalive.Unlock() 1239 e.stack.Stats().TCP.EstablishedTimedout.Increment() 1240 return &tcpip.ErrTimeout{} 1241 } 1242 1243 if e.keepalive.unacked >= e.keepalive.count { 1244 e.keepalive.Unlock() 1245 e.stack.Stats().TCP.EstablishedTimedout.Increment() 1246 return &tcpip.ErrTimeout{} 1247 } 1248 1249 // RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with 1250 // seg.seq = snd.nxt-1. 1251 e.keepalive.unacked++ 1252 e.keepalive.Unlock() 1253 e.snd.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, e.snd.SndNxt-1) 1254 e.resetKeepaliveTimer(false) 1255 return nil 1256 } 1257 1258 // resetKeepaliveTimer restarts or stops the keepalive timer, depending on 1259 // whether it is enabled for this endpoint. 1260 func (e *endpoint) resetKeepaliveTimer(receivedData bool) { 1261 e.keepalive.Lock() 1262 if receivedData { 1263 e.keepalive.unacked = 0 1264 } 1265 // Start the keepalive timer IFF it's enabled and there is no pending 1266 // data to send. 1267 if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.SndUna != e.snd.SndNxt { 1268 e.keepalive.timer.disable() 1269 e.keepalive.Unlock() 1270 return 1271 } 1272 if e.keepalive.unacked > 0 { 1273 e.keepalive.timer.enable(e.keepalive.interval) 1274 } else { 1275 e.keepalive.timer.enable(e.keepalive.idle) 1276 } 1277 e.keepalive.Unlock() 1278 } 1279 1280 // disableKeepaliveTimer stops the keepalive timer. 1281 func (e *endpoint) disableKeepaliveTimer() { 1282 e.keepalive.Lock() 1283 e.keepalive.timer.disable() 1284 e.keepalive.Unlock() 1285 } 1286 1287 // protocolMainLoopDone is called at the end of protocolMainLoop. 1288 // +checklocksrelease:e.mu 1289 func (e *endpoint) protocolMainLoopDone(closeTimer tcpip.Timer, closeWaker *sleep.Waker) { 1290 if e.snd != nil { 1291 e.snd.resendTimer.cleanup() 1292 e.snd.probeTimer.cleanup() 1293 e.snd.reorderTimer.cleanup() 1294 } 1295 1296 if closeTimer != nil { 1297 closeTimer.Stop() 1298 } 1299 1300 e.completeWorkerLocked() 1301 1302 if e.drainDone != nil { 1303 close(e.drainDone) 1304 } 1305 1306 e.mu.Unlock() 1307 1308 e.drainClosingSegmentQueue() 1309 1310 // When the protocol loop exits we should wake up our waiters. 1311 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1312 } 1313 1314 // protocolMainLoop is the main loop of the TCP protocol. It runs in its own 1315 // goroutine and is responsible for sending segments and handling received 1316 // segments. 1317 func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) tcpip.Error { 1318 var ( 1319 closeTimer tcpip.Timer 1320 closeWaker sleep.Waker 1321 ) 1322 1323 e.mu.Lock() 1324 if handshake { 1325 if err := e.h.complete(); err != nil { // +checklocksforce 1326 e.lastErrorMu.Lock() 1327 e.lastError = err 1328 e.lastErrorMu.Unlock() 1329 1330 e.setEndpointState(StateError) 1331 e.hardError = err 1332 1333 e.workerCleanup = true 1334 e.protocolMainLoopDone(closeTimer, &closeWaker) 1335 return err 1336 } 1337 } 1338 1339 // Reaching this point means that we successfully completed the 3-way 1340 // handshake with our peer. The current endpoint state could be any state 1341 // post ESTABLISHED, including CLOSED or ERROR if the endpoint processes a 1342 // RST from the peer via the dispatcher fast path, before the loop is 1343 // started. 1344 if s := e.EndpointState(); !s.connected() { 1345 switch s { 1346 case StateClose, StateError: 1347 // If the endpoint is in CLOSED/ERROR state, sender state has to be 1348 // initialized if the endpoint was previously established. 1349 if e.snd != nil { 1350 break 1351 } 1352 fallthrough 1353 default: 1354 panic("endpoint was not established, current state " + s.String()) 1355 } 1356 } 1357 1358 // Completing the 3-way handshake is an indication that the route is valid 1359 // and the remote is reachable as the only way we can complete a handshake 1360 // is if our SYN reached the remote and their ACK reached us. 1361 e.route.ConfirmReachable() 1362 1363 drained := e.drainDone != nil 1364 if drained { 1365 close(e.drainDone) 1366 e.mu.Unlock() 1367 <-e.undrain 1368 e.mu.Lock() 1369 } 1370 1371 // Set up the functions that will be called when the main protocol loop 1372 // wakes up. 1373 funcs := []struct { 1374 w *sleep.Waker 1375 f func() tcpip.Error 1376 }{ 1377 { 1378 w: &e.sndQueueInfo.sndWaker, 1379 f: func() tcpip.Error { 1380 e.sendData(nil /* next */) 1381 return nil 1382 }, 1383 }, 1384 { 1385 w: &closeWaker, 1386 f: func() tcpip.Error { 1387 // This means the socket is being closed due 1388 // to the TCP-FIN-WAIT2 timeout was hit. Just 1389 // mark the socket as closed. 1390 e.transitionToStateCloseLocked() 1391 e.workerCleanup = true 1392 return nil 1393 }, 1394 }, 1395 { 1396 w: &e.snd.resendWaker, 1397 f: func() tcpip.Error { 1398 if !e.snd.retransmitTimerExpired() { 1399 e.stack.Stats().TCP.EstablishedTimedout.Increment() 1400 return &tcpip.ErrTimeout{} 1401 } 1402 return nil 1403 }, 1404 }, 1405 { 1406 w: &e.snd.probeWaker, 1407 f: e.snd.probeTimerExpired, 1408 }, 1409 { 1410 w: &e.newSegmentWaker, 1411 f: func() tcpip.Error { 1412 return e.handleSegmentsLocked(false /* fastPath */) 1413 }, 1414 }, 1415 { 1416 w: &e.keepalive.waker, 1417 f: e.keepaliveTimerExpired, 1418 }, 1419 { 1420 w: &e.notificationWaker, 1421 f: func() tcpip.Error { 1422 n := e.fetchNotifications() 1423 if n¬ifyNonZeroReceiveWindow != 0 { 1424 e.rcv.nonZeroWindow() 1425 } 1426 1427 if n¬ifyMTUChanged != 0 { 1428 e.sndQueueInfo.sndQueueMu.Lock() 1429 count := e.sndQueueInfo.PacketTooBigCount 1430 e.sndQueueInfo.PacketTooBigCount = 0 1431 mtu := e.sndQueueInfo.SndMTU 1432 e.sndQueueInfo.sndQueueMu.Unlock() 1433 1434 e.snd.updateMaxPayloadSize(mtu, count) 1435 } 1436 1437 if n¬ifyReset != 0 || n¬ifyAbort != 0 { 1438 return &tcpip.ErrConnectionAborted{} 1439 } 1440 1441 if n¬ifyResetByPeer != 0 { 1442 return &tcpip.ErrConnectionReset{} 1443 } 1444 1445 if n¬ifyClose != 0 && e.closed { 1446 switch e.EndpointState() { 1447 case StateEstablished: 1448 // Perform full shutdown if the endpoint is still 1449 // established. This can occur when notifyClose 1450 // was asserted just before becoming established. 1451 e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) 1452 case StateFinWait2: 1453 // The socket has been closed and we are in FIN_WAIT2 1454 // so start the FIN_WAIT2 timer. 1455 if closeTimer == nil { 1456 closeTimer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, closeWaker.Assert) 1457 } 1458 } 1459 } 1460 1461 if n¬ifyKeepaliveChanged != 0 { 1462 // The timer could fire in background 1463 // when the endpoint is drained. That's 1464 // OK. See above. 1465 e.resetKeepaliveTimer(true) 1466 } 1467 1468 if n¬ifyDrain != 0 { 1469 for !e.segmentQueue.empty() { 1470 if err := e.handleSegmentsLocked(false /* fastPath */); err != nil { 1471 return err 1472 } 1473 } 1474 if !e.EndpointState().closed() { 1475 // Only block the worker if the endpoint 1476 // is not in closed state or error state. 1477 close(e.drainDone) 1478 e.mu.Unlock() // +checklocksforce 1479 <-e.undrain 1480 e.mu.Lock() 1481 } 1482 } 1483 1484 if n¬ifyTickleWorker != 0 { 1485 // Just a tickle notification. No need to do 1486 // anything. 1487 return nil 1488 } 1489 1490 return nil 1491 }, 1492 }, 1493 { 1494 w: &e.snd.reorderWaker, 1495 f: e.snd.rc.reorderTimerExpired, 1496 }, 1497 } 1498 1499 // Initialize the sleeper based on the wakers in funcs. 1500 var s sleep.Sleeper 1501 for i := range funcs { 1502 s.AddWaker(funcs[i].w, i) 1503 } 1504 1505 // Notify the caller that the waker initialization is complete and the 1506 // endpoint is ready. 1507 if wakerInitDone != nil { 1508 close(wakerInitDone) 1509 } 1510 1511 // Tell waiters that the endpoint is connected and writable. 1512 e.waiterQueue.Notify(waiter.WritableEvents) 1513 1514 // The following assertions and notifications are needed for restored 1515 // endpoints. Fresh newly created endpoints have empty states and should 1516 // not invoke any. 1517 if !e.segmentQueue.empty() { 1518 e.newSegmentWaker.Assert() 1519 } 1520 1521 e.rcvQueueInfo.rcvQueueMu.Lock() 1522 if !e.rcvQueueInfo.rcvQueue.Empty() { 1523 e.waiterQueue.Notify(waiter.ReadableEvents) 1524 } 1525 e.rcvQueueInfo.rcvQueueMu.Unlock() 1526 1527 if e.workerCleanup { 1528 e.notifyProtocolGoroutine(notifyClose) 1529 } 1530 1531 // Main loop. Handle segments until both send and receive ends of the 1532 // connection have completed. 1533 cleanupOnError := func(err tcpip.Error) { 1534 e.stack.Stats().TCP.CurrentConnected.Decrement() 1535 e.workerCleanup = true 1536 if err != nil { 1537 e.resetConnectionLocked(err) 1538 } 1539 } 1540 1541 loop: 1542 for { 1543 switch e.EndpointState() { 1544 case StateTimeWait, StateClose, StateError: 1545 break loop 1546 } 1547 1548 e.mu.Unlock() 1549 v, _ := s.Fetch(true /* block */) 1550 e.mu.Lock() 1551 1552 // We need to double check here because the notification may be 1553 // stale by the time we got around to processing it. 1554 switch e.EndpointState() { 1555 case StateError: 1556 // If the endpoint has already transitioned to an ERROR 1557 // state just pass nil here as any reset that may need 1558 // to be sent etc should already have been done and we 1559 // just want to terminate the loop and cleanup the 1560 // endpoint. 1561 cleanupOnError(nil) 1562 e.protocolMainLoopDone(closeTimer, &closeWaker) 1563 return nil 1564 case StateTimeWait: 1565 fallthrough 1566 case StateClose: 1567 break loop 1568 default: 1569 if err := funcs[v].f(); err != nil { 1570 cleanupOnError(err) 1571 e.protocolMainLoopDone(closeTimer, &closeWaker) 1572 return nil 1573 } 1574 } 1575 } 1576 1577 var reuseTW func() 1578 if e.EndpointState() == StateTimeWait { 1579 // Disable close timer as we now entering real TIME_WAIT. 1580 if closeTimer != nil { 1581 closeTimer.Stop() 1582 } 1583 // Mark the current sleeper done so as to free all associated 1584 // wakers. 1585 s.Done() 1586 // Wake up any waiters before we enter TIME_WAIT. 1587 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 1588 e.workerCleanup = true 1589 reuseTW = e.doTimeWait() 1590 } 1591 1592 // Handle any StateError transition from StateTimeWait. 1593 if e.EndpointState() == StateError { 1594 cleanupOnError(nil) 1595 e.protocolMainLoopDone(closeTimer, &closeWaker) 1596 return nil 1597 } 1598 1599 e.transitionToStateCloseLocked() 1600 1601 e.protocolMainLoopDone(closeTimer, &closeWaker) 1602 1603 // A new SYN was received during TIME_WAIT and we need to abort 1604 // the timewait and redirect the segment to the listener queue 1605 if reuseTW != nil { 1606 reuseTW() 1607 } 1608 1609 return nil 1610 } 1611 1612 // handleTimeWaitSegments processes segments received during TIME_WAIT 1613 // state. 1614 func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) { 1615 checkRequeue := true 1616 for i := 0; i < maxSegmentsPerWake; i++ { 1617 s := e.segmentQueue.dequeue() 1618 if s == nil { 1619 checkRequeue = false 1620 break 1621 } 1622 extTW, newSyn := e.rcv.handleTimeWaitSegment(s) 1623 if newSyn { 1624 info := e.TransportEndpointInfo 1625 newID := info.ID 1626 newID.RemoteAddress = "" 1627 newID.RemotePort = 0 1628 netProtos := []tcpip.NetworkProtocolNumber{info.NetProto} 1629 // If the local address is an IPv4 address then also 1630 // look for IPv6 dual stack endpoints that might be 1631 // listening on the local address. 1632 if newID.LocalAddress.To4() != "" { 1633 netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber} 1634 } 1635 for _, netProto := range netProtos { 1636 if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, s.nicID); listenEP != nil { 1637 tcpEP := listenEP.(*endpoint) 1638 if EndpointState(tcpEP.State()) == StateListen { 1639 reuseTW = func() { 1640 if !tcpEP.enqueueSegment(s) { 1641 s.decRef() 1642 return 1643 } 1644 tcpEP.newSegmentWaker.Assert() 1645 } 1646 // We explicitly do not decRef 1647 // the segment as it's still 1648 // valid and being reflected to 1649 // a listening endpoint. 1650 return false, reuseTW 1651 } 1652 } 1653 } 1654 } 1655 if extTW { 1656 extendTimeWait = true 1657 } 1658 s.decRef() 1659 } 1660 if checkRequeue && !e.segmentQueue.empty() { 1661 e.newSegmentWaker.Assert() 1662 } 1663 return extendTimeWait, nil 1664 } 1665 1666 // doTimeWait is responsible for handling the TCP behaviour once a socket 1667 // enters the TIME_WAIT state. Optionally it can return a closure that 1668 // should be executed after releasing the endpoint registrations. This is 1669 // done in cases where a new SYN is received during TIME_WAIT that carries 1670 // a sequence number larger than one see on the connection. 1671 // +checklocks:e.mu 1672 func (e *endpoint) doTimeWait() (twReuse func()) { 1673 // Trigger a 2 * MSL time wait state. During this period 1674 // we will drop all incoming segments. 1675 // NOTE: On Linux this is not configurable and is fixed at 60 seconds. 1676 timeWaitDuration := DefaultTCPTimeWaitTimeout 1677 1678 // Get the stack wide configuration. 1679 var tcpTW tcpip.TCPTimeWaitTimeoutOption 1680 if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil { 1681 timeWaitDuration = time.Duration(tcpTW) 1682 } 1683 1684 const newSegment = 1 1685 const notification = 2 1686 const timeWaitDone = 3 1687 1688 var s sleep.Sleeper 1689 defer s.Done() 1690 s.AddWaker(&e.newSegmentWaker, newSegment) 1691 s.AddWaker(&e.notificationWaker, notification) 1692 1693 var timeWaitWaker sleep.Waker 1694 s.AddWaker(&timeWaitWaker, timeWaitDone) 1695 timeWaitTimer := e.stack.Clock().AfterFunc(timeWaitDuration, timeWaitWaker.Assert) 1696 defer timeWaitTimer.Stop() 1697 1698 for { 1699 e.mu.Unlock() 1700 v, _ := s.Fetch(true /* block */) 1701 e.mu.Lock() 1702 switch v { 1703 case newSegment: 1704 extendTimeWait, reuseTW := e.handleTimeWaitSegments() 1705 if reuseTW != nil { 1706 return reuseTW 1707 } 1708 if extendTimeWait { 1709 timeWaitTimer.Reset(timeWaitDuration) 1710 } 1711 case notification: 1712 n := e.fetchNotifications() 1713 if n¬ifyAbort != 0 { 1714 return nil 1715 } 1716 if n¬ifyDrain != 0 { 1717 for !e.segmentQueue.empty() { 1718 // Ignore extending TIME_WAIT during a 1719 // save. For sockets in TIME_WAIT we just 1720 // terminate the TIME_WAIT early. 1721 e.handleTimeWaitSegments() 1722 } 1723 close(e.drainDone) 1724 e.mu.Unlock() 1725 <-e.undrain 1726 e.mu.Lock() 1727 return nil 1728 } 1729 case timeWaitDone: 1730 return nil 1731 } 1732 } 1733 }