github.com/polevpn/netstack@v1.10.9/tcpip/transport/tcp/connect.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "encoding/binary" 19 "sync" 20 "time" 21 22 "github.com/polevpn/netstack/rand" 23 "github.com/polevpn/netstack/sleep" 24 "github.com/polevpn/netstack/tcpip" 25 "github.com/polevpn/netstack/tcpip/buffer" 26 "github.com/polevpn/netstack/tcpip/hash/jenkins" 27 "github.com/polevpn/netstack/tcpip/header" 28 "github.com/polevpn/netstack/tcpip/seqnum" 29 "github.com/polevpn/netstack/tcpip/stack" 30 "github.com/polevpn/netstack/waiter" 31 ) 32 33 // maxSegmentsPerWake is the maximum number of segments to process in the main 34 // protocol goroutine per wake-up. Yielding [after this number of segments are 35 // processed] allows other events to be processed as well (e.g., timeouts, 36 // resets, etc.). 37 const maxSegmentsPerWake = 100 38 39 type handshakeState int 40 41 // The following are the possible states of the TCP connection during a 3-way 42 // handshake. A depiction of the states and transitions can be found in RFC 793, 43 // page 23. 44 const ( 45 handshakeSynSent handshakeState = iota 46 handshakeSynRcvd 47 handshakeCompleted 48 ) 49 50 // The following are used to set up sleepers. 
51 const ( 52 wakerForNotification = iota 53 wakerForNewSegment 54 wakerForResend 55 wakerForResolution 56 ) 57 58 const ( 59 // Maximum space available for options. 60 maxOptionSize = 40 61 ) 62 63 // handshake holds the state used during a TCP 3-way handshake. 64 type handshake struct { 65 ep *endpoint 66 state handshakeState 67 active bool 68 flags uint8 69 ackNum seqnum.Value 70 71 // iss is the initial send sequence number, as defined in RFC 793. 72 iss seqnum.Value 73 74 // rcvWnd is the receive window, as defined in RFC 793. 75 rcvWnd seqnum.Size 76 77 // sndWnd is the send window, as defined in RFC 793. 78 sndWnd seqnum.Size 79 80 // mss is the maximum segment size received from the peer. 81 mss uint16 82 83 // sndWndScale is the send window scale, as defined in RFC 1323. A 84 // negative value means no scaling is supported by the peer. 85 sndWndScale int 86 87 // rcvWndScale is the receive window scale, as defined in RFC 1323. 88 rcvWndScale int 89 } 90 91 func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake { 92 rcvWndScale := ep.rcvWndScaleForHandshake() 93 94 // Round-down the rcvWnd to a multiple of wndScale. This ensures that the 95 // window offered in SYN won't be reduced due to the loss of precision if 96 // window scaling is enabled after the handshake. 97 rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale) 98 99 // Ensure we can always accept at least 1 byte if the scale specified 100 // was too high for the provided rcvWnd. 101 if rcvWnd == 0 { 102 rcvWnd = 1 103 } 104 105 h := handshake{ 106 ep: ep, 107 active: true, 108 rcvWnd: rcvWnd, 109 rcvWndScale: int(rcvWndScale), 110 } 111 h.resetState() 112 return h 113 } 114 115 // FindWndScale determines the window scale to use for the given maximum window 116 // size. 
117 func FindWndScale(wnd seqnum.Size) int { 118 if wnd < 0x10000 { 119 return 0 120 } 121 122 max := seqnum.Size(0xffff) 123 s := 0 124 for wnd > max && s < header.MaxWndScale { 125 s++ 126 max <<= 1 127 } 128 129 return s 130 } 131 132 // resetState resets the state of the handshake object such that it becomes 133 // ready for a new 3-way handshake. 134 func (h *handshake) resetState() { 135 b := make([]byte, 4) 136 if _, err := rand.Read(b); err != nil { 137 panic(err) 138 } 139 140 h.state = handshakeSynSent 141 h.flags = header.TCPFlagSyn 142 h.ackNum = 0 143 h.mss = 0 144 h.iss = generateSecureISN(h.ep.ID, h.ep.stack.Seed()) 145 } 146 147 // generateSecureISN generates a secure Initial Sequence number based on the 148 // recommendation here https://tools.ietf.org/html/rfc6528#page-3. 149 func generateSecureISN(id stack.TransportEndpointID, seed uint32) seqnum.Value { 150 isnHasher := jenkins.Sum32(seed) 151 isnHasher.Write([]byte(id.LocalAddress)) 152 isnHasher.Write([]byte(id.RemoteAddress)) 153 portBuf := make([]byte, 2) 154 binary.LittleEndian.PutUint16(portBuf, id.LocalPort) 155 isnHasher.Write(portBuf) 156 binary.LittleEndian.PutUint16(portBuf, id.RemotePort) 157 isnHasher.Write(portBuf) 158 // The time period here is 64ns. This is similar to what linux uses 159 // generate a sequence number that overlaps less than one 160 // time per MSL (2 minutes). 161 // 162 // A 64ns clock ticks 10^9/64 = 15625000) times in a second. 163 // To wrap the whole 32 bit space would require 164 // 2^32/1562500 ~ 274 seconds. 165 // 166 // Which sort of guarantees that we won't reuse the ISN for a new 167 // connection for the same tuple for at least 274s. 168 isn := isnHasher.Sum32() + uint32(time.Now().UnixNano()>>6) 169 return seqnum.Value(isn) 170 } 171 172 // effectiveRcvWndScale returns the effective receive window scale to be used. 
173 // If the peer doesn't support window scaling, the effective rcv wnd scale is 174 // zero; otherwise it's the value calculated based on the initial rcv wnd. 175 func (h *handshake) effectiveRcvWndScale() uint8 { 176 if h.sndWndScale < 0 { 177 return 0 178 } 179 return uint8(h.rcvWndScale) 180 } 181 182 // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD 183 // state. 184 func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) { 185 h.active = false 186 h.state = handshakeSynRcvd 187 h.flags = header.TCPFlagSyn | header.TCPFlagAck 188 h.iss = iss 189 h.ackNum = irs + 1 190 h.mss = opts.MSS 191 h.sndWndScale = opts.WS 192 h.ep.mu.Lock() 193 h.ep.state = StateSynRecv 194 h.ep.mu.Unlock() 195 } 196 197 // checkAck checks if the ACK number, if present, of a segment received during 198 // a TCP 3-way handshake is valid. If it's not, a RST segment is sent back in 199 // response. 200 func (h *handshake) checkAck(s *segment) bool { 201 if s.flagIsSet(header.TCPFlagAck) && s.ackNumber != h.iss+1 { 202 // RFC 793, page 36, states that a reset must be generated when 203 // the connection is in any non-synchronized state and an 204 // incoming segment acknowledges something not yet sent. The 205 // connection remains in the same state. 206 ack := s.sequenceNumber.Add(s.logicalLen()) 207 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, s.ackNumber, ack, 0) 208 return false 209 } 210 211 return true 212 } 213 214 // synSentState handles a segment received when the TCP 3-way handshake is in 215 // the SYN-SENT state. 216 func (h *handshake) synSentState(s *segment) *tcpip.Error { 217 // RFC 793, page 37, states that in the SYN-SENT state, a reset is 218 // acceptable if the ack field acknowledges the SYN. 
219 if s.flagIsSet(header.TCPFlagRst) { 220 if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == h.iss+1 { 221 return tcpip.ErrConnectionRefused 222 } 223 return nil 224 } 225 226 if !h.checkAck(s) { 227 return nil 228 } 229 230 // We are in the SYN-SENT state. We only care about segments that have 231 // the SYN flag. 232 if !s.flagIsSet(header.TCPFlagSyn) { 233 return nil 234 } 235 236 // Parse the SYN options. 237 rcvSynOpts := parseSynSegmentOptions(s) 238 239 // Remember if the Timestamp option was negotiated. 240 h.ep.maybeEnableTimestamp(&rcvSynOpts) 241 242 // Remember if the SACKPermitted option was negotiated. 243 h.ep.maybeEnableSACKPermitted(&rcvSynOpts) 244 245 // Remember the sequence we'll ack from now on. 246 h.ackNum = s.sequenceNumber + 1 247 h.flags |= header.TCPFlagAck 248 h.mss = rcvSynOpts.MSS 249 h.sndWndScale = rcvSynOpts.WS 250 251 // If this is a SYN ACK response, we only need to acknowledge the SYN 252 // and the handshake is completed. 253 if s.flagIsSet(header.TCPFlagAck) { 254 h.state = handshakeCompleted 255 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale()) 256 return nil 257 } 258 259 // A SYN segment was received, but no ACK in it. We acknowledge the SYN 260 // but resend our own SYN and wait for it to be acknowledged in the 261 // SYN-RCVD state. 262 h.state = handshakeSynRcvd 263 h.ep.mu.Lock() 264 h.ep.state = StateSynRecv 265 ttl := h.ep.ttl 266 h.ep.mu.Unlock() 267 synOpts := header.TCPSynOptions{ 268 WS: int(h.effectiveRcvWndScale()), 269 TS: rcvSynOpts.TS, 270 TSVal: h.ep.timestamp(), 271 TSEcr: h.ep.recentTS, 272 273 // We only send SACKPermitted if the other side indicated it 274 // permits SACK. This is not explicitly defined in the RFC but 275 // this is the behaviour implemented by Linux. 
276 SACKPermitted: rcvSynOpts.SACKPermitted, 277 MSS: h.ep.amss, 278 } 279 if ttl == 0 { 280 ttl = s.route.DefaultTTL() 281 } 282 h.ep.sendSynTCP(&s.route, h.ep.ID, ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts) 283 return nil 284 } 285 286 // synRcvdState handles a segment received when the TCP 3-way handshake is in 287 // the SYN-RCVD state. 288 func (h *handshake) synRcvdState(s *segment) *tcpip.Error { 289 if s.flagIsSet(header.TCPFlagRst) { 290 // RFC 793, page 37, states that in the SYN-RCVD state, a reset 291 // is acceptable if the sequence number is in the window. 292 if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { 293 return tcpip.ErrConnectionRefused 294 } 295 return nil 296 } 297 298 if !h.checkAck(s) { 299 return nil 300 } 301 302 if s.flagIsSet(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 { 303 // We received two SYN segments with different sequence 304 // numbers, so we reset this and restart the whole 305 // process, except that we don't reset the timer. 306 ack := s.sequenceNumber.Add(s.logicalLen()) 307 seq := seqnum.Value(0) 308 if s.flagIsSet(header.TCPFlagAck) { 309 seq = s.ackNumber 310 } 311 h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0) 312 313 if !h.active { 314 return tcpip.ErrInvalidEndpointState 315 } 316 317 h.resetState() 318 synOpts := header.TCPSynOptions{ 319 WS: h.rcvWndScale, 320 TS: h.ep.sendTSOk, 321 TSVal: h.ep.timestamp(), 322 TSEcr: h.ep.recentTS, 323 SACKPermitted: h.ep.sackPermitted, 324 MSS: h.ep.amss, 325 } 326 h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts) 327 return nil 328 } 329 330 // We have previously received (and acknowledged) the peer's SYN. If the 331 // peer acknowledges our SYN, the handshake is completed. 
332 if s.flagIsSet(header.TCPFlagAck) { 333 // If the timestamp option is negotiated and the segment does 334 // not carry a timestamp option then the segment must be dropped 335 // as per https://tools.ietf.org/html/rfc7323#section-3.2. 336 if h.ep.sendTSOk && !s.parsedOptions.TS { 337 h.ep.stack.Stats().DroppedPackets.Increment() 338 return nil 339 } 340 341 // Update timestamp if required. See RFC7323, section-4.3. 342 if h.ep.sendTSOk && s.parsedOptions.TS { 343 h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber) 344 } 345 h.state = handshakeCompleted 346 return nil 347 } 348 349 return nil 350 } 351 352 func (h *handshake) handleSegment(s *segment) *tcpip.Error { 353 h.sndWnd = s.window 354 if !s.flagIsSet(header.TCPFlagSyn) && h.sndWndScale > 0 { 355 h.sndWnd <<= uint8(h.sndWndScale) 356 } 357 358 switch h.state { 359 case handshakeSynRcvd: 360 return h.synRcvdState(s) 361 case handshakeSynSent: 362 return h.synSentState(s) 363 } 364 return nil 365 } 366 367 // processSegments goes through the segment queue and processes up to 368 // maxSegmentsPerWake (if they're available). 369 func (h *handshake) processSegments() *tcpip.Error { 370 for i := 0; i < maxSegmentsPerWake; i++ { 371 s := h.ep.segmentQueue.dequeue() 372 if s == nil { 373 return nil 374 } 375 376 err := h.handleSegment(s) 377 s.decRef() 378 if err != nil { 379 return err 380 } 381 382 // We stop processing packets once the handshake is completed, 383 // otherwise we may process packets meant to be processed by 384 // the main protocol goroutine. 385 if h.state == handshakeCompleted { 386 break 387 } 388 } 389 390 // If the queue is not empty, make sure we'll wake up in the next 391 // iteration. 392 if !h.ep.segmentQueue.empty() { 393 h.ep.newSegmentWaker.Assert() 394 } 395 396 return nil 397 } 398 399 func (h *handshake) resolveRoute() *tcpip.Error { 400 // Set up the wakers. 
401 s := sleep.Sleeper{} 402 resolutionWaker := &sleep.Waker{} 403 s.AddWaker(resolutionWaker, wakerForResolution) 404 s.AddWaker(&h.ep.notificationWaker, wakerForNotification) 405 defer s.Done() 406 407 // Initial action is to resolve route. 408 index := wakerForResolution 409 for { 410 switch index { 411 case wakerForResolution: 412 if _, err := h.ep.route.Resolve(resolutionWaker); err != tcpip.ErrWouldBlock { 413 if err == tcpip.ErrNoLinkAddress { 414 h.ep.stats.SendErrors.NoLinkAddr.Increment() 415 } else if err != nil { 416 h.ep.stats.SendErrors.NoRoute.Increment() 417 } 418 // Either success (err == nil) or failure. 419 return err 420 } 421 // Resolution not completed. Keep trying... 422 423 case wakerForNotification: 424 n := h.ep.fetchNotifications() 425 if n¬ifyClose != 0 { 426 h.ep.route.RemoveWaker(resolutionWaker) 427 return tcpip.ErrAborted 428 } 429 if n¬ifyDrain != 0 { 430 close(h.ep.drainDone) 431 <-h.ep.undrain 432 } 433 } 434 435 // Wait for notification. 436 index, _ = s.Fetch(true) 437 } 438 } 439 440 // execute executes the TCP 3-way handshake. 441 func (h *handshake) execute() *tcpip.Error { 442 if h.ep.route.IsResolutionRequired() { 443 if err := h.resolveRoute(); err != nil { 444 return err 445 } 446 } 447 448 // Initialize the resend timer. 449 resendWaker := sleep.Waker{} 450 timeOut := time.Duration(time.Second) 451 rt := time.AfterFunc(timeOut, func() { 452 resendWaker.Assert() 453 }) 454 defer rt.Stop() 455 456 // Set up the wakers. 457 s := sleep.Sleeper{} 458 s.AddWaker(&resendWaker, wakerForResend) 459 s.AddWaker(&h.ep.notificationWaker, wakerForNotification) 460 s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment) 461 defer s.Done() 462 463 var sackEnabled SACKEnabled 464 if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil { 465 // If stack returned an error when checking for SACKEnabled 466 // status then just default to switching off SACK negotiation. 
467 sackEnabled = false 468 } 469 470 // Send the initial SYN segment and loop until the handshake is 471 // completed. 472 h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route) 473 474 synOpts := header.TCPSynOptions{ 475 WS: h.rcvWndScale, 476 TS: true, 477 TSVal: h.ep.timestamp(), 478 TSEcr: h.ep.recentTS, 479 SACKPermitted: bool(sackEnabled), 480 MSS: h.ep.amss, 481 } 482 483 // Execute is also called in a listen context so we want to make sure we 484 // only send the TS/SACK option when we received the TS/SACK in the 485 // initial SYN. 486 if h.state == handshakeSynRcvd { 487 synOpts.TS = h.ep.sendTSOk 488 synOpts.SACKPermitted = h.ep.sackPermitted && bool(sackEnabled) 489 if h.sndWndScale < 0 { 490 // Disable window scaling if the peer did not send us 491 // the window scaling option. 492 synOpts.WS = -1 493 } 494 } 495 h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts) 496 497 for h.state != handshakeCompleted { 498 switch index, _ := s.Fetch(true); index { 499 case wakerForResend: 500 timeOut *= 2 501 if timeOut > 60*time.Second { 502 return tcpip.ErrTimeout 503 } 504 rt.Reset(timeOut) 505 h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts) 506 507 case wakerForNotification: 508 n := h.ep.fetchNotifications() 509 if n¬ifyClose != 0 { 510 return tcpip.ErrAborted 511 } 512 if n¬ifyDrain != 0 { 513 for !h.ep.segmentQueue.empty() { 514 s := h.ep.segmentQueue.dequeue() 515 err := h.handleSegment(s) 516 s.decRef() 517 if err != nil { 518 return err 519 } 520 if h.state == handshakeCompleted { 521 return nil 522 } 523 } 524 close(h.ep.drainDone) 525 <-h.ep.undrain 526 } 527 528 case wakerForNewSegment: 529 if err := h.processSegments(); err != nil { 530 return err 531 } 532 } 533 } 534 535 return nil 536 } 537 538 func parseSynSegmentOptions(s *segment) header.TCPSynOptions { 539 synOpts := header.ParseSynOptions(s.options, 
s.flagIsSet(header.TCPFlagAck)) 540 if synOpts.TS { 541 s.parsedOptions.TSVal = synOpts.TSVal 542 s.parsedOptions.TSEcr = synOpts.TSEcr 543 } 544 return synOpts 545 } 546 547 var optionPool = sync.Pool{ 548 New: func() interface{} { 549 return make([]byte, maxOptionSize) 550 }, 551 } 552 553 func getOptions() []byte { 554 return optionPool.Get().([]byte) 555 } 556 557 func putOptions(options []byte) { 558 // Reslice to full capacity. 559 optionPool.Put(options[0:cap(options)]) 560 } 561 562 func makeSynOptions(opts header.TCPSynOptions) []byte { 563 // Emulate linux option order. This is as follows: 564 // 565 // if md5: NOP NOP MD5SIG 18 md5sig(16) 566 // if mss: MSS 4 mss(2) 567 // if ts and sack_advertise: 568 // SACK 2 TIMESTAMP 2 timestamp(8) 569 // elif ts: NOP NOP TIMESTAMP 10 timestamp(8) 570 // elif sack: NOP NOP SACK 2 571 // if wscale: NOP WINDOW 3 ws(1) 572 // if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8)) 573 // [for each block] start_seq(4) end_seq(4) 574 // if fastopen_cookie: 575 // if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2) 576 // else: FASTOPEN (2 + len(cookie)) 577 // cookie(variable) [padding to four bytes] 578 // 579 options := getOptions() 580 581 // Always encode the mss. 582 offset := header.EncodeMSSOption(uint32(opts.MSS), options) 583 584 // Special ordering is required here. If both TS and SACK are enabled, 585 // then the SACK option precedes TS, with no padding. If they are 586 // enabled individually, then we see padding before the option. 
587 if opts.TS && opts.SACKPermitted { 588 offset += header.EncodeSACKPermittedOption(options[offset:]) 589 offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) 590 } else if opts.TS { 591 offset += header.EncodeNOP(options[offset:]) 592 offset += header.EncodeNOP(options[offset:]) 593 offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) 594 } else if opts.SACKPermitted { 595 offset += header.EncodeNOP(options[offset:]) 596 offset += header.EncodeNOP(options[offset:]) 597 offset += header.EncodeSACKPermittedOption(options[offset:]) 598 } 599 600 // Initialize the WS option. 601 if opts.WS >= 0 { 602 offset += header.EncodeNOP(options[offset:]) 603 offset += header.EncodeWSOption(opts.WS, options[offset:]) 604 } 605 606 // Padding to the end; note that this never apply unless we add a 607 // fastopen option, we always expect the offset to remain the same. 608 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 609 panic("unexpected option encoding") 610 } 611 612 return options[:offset] 613 } 614 615 func (e *endpoint) sendSynTCP(r *stack.Route, id stack.TransportEndpointID, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts header.TCPSynOptions) *tcpip.Error { 616 options := makeSynOptions(opts) 617 // We ignore SYN send errors and let the callers re-attempt send. 
618 if err := e.sendTCP(r, id, buffer.VectorisedView{}, ttl, tos, flags, seq, ack, rcvWnd, options, nil); err != nil { 619 e.stats.SendErrors.SynSendToNetworkFailed.Increment() 620 } 621 putOptions(options) 622 return nil 623 } 624 625 func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error { 626 if err := sendTCP(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso); err != nil { 627 e.stats.SendErrors.SegmentSendToNetworkFailed.Increment() 628 return err 629 } 630 e.stats.SegmentsSent.Increment() 631 return nil 632 } 633 634 func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, d *stack.PacketDescriptor, data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) { 635 optLen := len(opts) 636 hdr := &d.Hdr 637 packetSize := d.Size 638 off := d.Off 639 // Initialize the header. 640 tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen)) 641 tcp.Encode(&header.TCPFields{ 642 SrcPort: id.LocalPort, 643 DstPort: id.RemotePort, 644 SeqNum: uint32(seq), 645 AckNum: uint32(ack), 646 DataOffset: uint8(header.TCPMinimumSize + optLen), 647 Flags: flags, 648 WindowSize: uint16(rcvWnd), 649 }) 650 copy(tcp[header.TCPMinimumSize:], opts) 651 652 length := uint16(hdr.UsedLength() + packetSize) 653 xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) 654 // Only calculate the checksum if offloading isn't supported. 655 if gso != nil && gso.NeedsCsum { 656 // This is called CHECKSUM_PARTIAL in the Linux kernel. We 657 // calculate a checksum of the pseudo-header and save it in the 658 // TCP header, then the kernel calculate a checksum of the 659 // header and data and get the right sum of the TCP packet. 
660 tcp.SetChecksum(xsum) 661 } else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 { 662 xsum = header.ChecksumVVWithOffset(data, xsum, off, packetSize) 663 tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) 664 } 665 666 } 667 668 func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error { 669 optLen := len(opts) 670 if rcvWnd > 0xffff { 671 rcvWnd = 0xffff 672 } 673 674 mss := int(gso.MSS) 675 n := (data.Size() + mss - 1) / mss 676 677 hdrs := stack.NewPacketDescriptors(n, header.TCPMinimumSize+int(r.MaxHeaderLength())+optLen) 678 679 size := data.Size() 680 off := 0 681 for i := 0; i < n; i++ { 682 packetSize := mss 683 if packetSize > size { 684 packetSize = size 685 } 686 size -= packetSize 687 hdrs[i].Off = off 688 hdrs[i].Size = packetSize 689 buildTCPHdr(r, id, &hdrs[i], data, flags, seq, ack, rcvWnd, opts, gso) 690 off += packetSize 691 seq = seq.Add(seqnum.Size(packetSize)) 692 } 693 if ttl == 0 { 694 ttl = r.DefaultTTL() 695 } 696 sent, err := r.WritePackets(gso, hdrs, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}) 697 if err != nil { 698 r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent)) 699 } 700 r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent)) 701 return err 702 } 703 704 // sendTCP sends a TCP segment with the provided options via the provided 705 // network endpoint and under the provided identity. 
706 func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error { 707 optLen := len(opts) 708 if rcvWnd > 0xffff { 709 rcvWnd = 0xffff 710 } 711 712 if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() { 713 return sendTCPBatch(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso) 714 } 715 716 d := &stack.PacketDescriptor{ 717 Hdr: buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen), 718 Off: 0, 719 Size: data.Size(), 720 } 721 buildTCPHdr(r, id, d, data, flags, seq, ack, rcvWnd, opts, gso) 722 723 if ttl == 0 { 724 ttl = r.DefaultTTL() 725 } 726 if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, tcpip.PacketBuffer{ 727 Header: d.Hdr, 728 Data: data, 729 }); err != nil { 730 r.Stats().TCP.SegmentSendErrors.Increment() 731 return err 732 } 733 r.Stats().TCP.SegmentsSent.Increment() 734 if (flags & header.TCPFlagRst) != 0 { 735 r.Stats().TCP.ResetsSent.Increment() 736 } 737 return nil 738 } 739 740 // makeOptions makes an options slice. 741 func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte { 742 options := getOptions() 743 offset := 0 744 745 // N.B. the ordering here matches the ordering used by Linux internally 746 // and described in the raw makeOptions function. We don't include 747 // unnecessary cases here (post connection.) 748 if e.sendTSOk { 749 // Embed the timestamp if timestamp has been enabled. 750 // 751 // We only use the lower 32 bits of the unix time in 752 // milliseconds. This is similar to what Linux does where it 753 // uses the lower 32 bits of the jiffies value in the tsVal 754 // field of the timestamp option. 
755 // 756 // Further, RFC7323 section-5.4 recommends millisecond 757 // resolution as the lowest recommended resolution for the 758 // timestamp clock. 759 // 760 // Ref: https://tools.ietf.org/html/rfc7323#section-5.4. 761 offset += header.EncodeNOP(options[offset:]) 762 offset += header.EncodeNOP(options[offset:]) 763 offset += header.EncodeTSOption(e.timestamp(), uint32(e.recentTS), options[offset:]) 764 } 765 if e.sackPermitted && len(sackBlocks) > 0 { 766 offset += header.EncodeNOP(options[offset:]) 767 offset += header.EncodeNOP(options[offset:]) 768 offset += header.EncodeSACKBlocks(sackBlocks, options[offset:]) 769 } 770 771 // We expect the above to produce an aligned offset. 772 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 773 panic("unexpected option encoding") 774 } 775 776 return options[:offset] 777 } 778 779 // sendRaw sends a TCP segment to the endpoint's peer. 780 func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error { 781 var sackBlocks []header.SACKBlock 782 if e.state == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) { 783 sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] 784 } 785 options := e.makeOptions(sackBlocks) 786 err := e.sendTCP(&e.route, e.ID, data, e.ttl, e.sendTOS, flags, seq, ack, rcvWnd, options, e.gso) 787 putOptions(options) 788 return err 789 } 790 791 func (e *endpoint) handleWrite() *tcpip.Error { 792 // Move packets from send queue to send list. The queue is accessible 793 // from other goroutines and protected by the send mutex, while the send 794 // list is only accessible from the handler goroutine, so it needs no 795 // mutexes. 
796 e.sndBufMu.Lock() 797 798 first := e.sndQueue.Front() 799 if first != nil { 800 e.snd.writeList.PushBackList(&e.sndQueue) 801 e.snd.sndNxtList.UpdateForward(e.sndBufInQueue) 802 e.sndBufInQueue = 0 803 } 804 805 e.sndBufMu.Unlock() 806 807 // Initialize the next segment to write if it's currently nil. 808 if e.snd.writeNext == nil { 809 e.snd.writeNext = first 810 } 811 812 // Push out any new packets. 813 e.snd.sendData() 814 815 return nil 816 } 817 818 func (e *endpoint) handleClose() *tcpip.Error { 819 // Drain the send queue. 820 e.handleWrite() 821 822 // Mark send side as closed. 823 e.snd.closed = true 824 825 return nil 826 } 827 828 // resetConnectionLocked puts the endpoint in an error state with the given 829 // error code and sends a RST if and only if the error is not ErrConnectionReset 830 // indicating that the connection is being reset due to receiving a RST. This 831 // method must only be called from the protocol goroutine. 832 func (e *endpoint) resetConnectionLocked(err *tcpip.Error) { 833 // Only send a reset if the connection is being aborted for a reason 834 // other than receiving a reset. 835 if e.state == StateEstablished || e.state == StateCloseWait { 836 e.stack.Stats().TCP.EstablishedResets.Increment() 837 e.stack.Stats().TCP.CurrentEstablished.Decrement() 838 } 839 e.state = StateError 840 e.HardError = err 841 if err != tcpip.ErrConnectionReset { 842 // The exact sequence number to be used for the RST is the same as the 843 // one used by Linux. We need to handle the case of window being shrunk 844 // which can cause sndNxt to be outside the acceptable window on the 845 // receiver. 846 // 847 // See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more 848 // information. 
849 sndWndEnd := e.snd.sndUna.Add(e.snd.sndWnd) 850 resetSeqNum := sndWndEnd 851 if !sndWndEnd.LessThan(e.snd.sndNxt) || e.snd.sndNxt.Size(sndWndEnd) < (1<<e.snd.sndWndScale) { 852 resetSeqNum = e.snd.sndNxt 853 } 854 e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.rcvNxt, 0) 855 } 856 }

// completeWorkerLocked is called by the worker goroutine when it's about to
// exit. It marks the worker as completed and performs cleanup work if requested
// by Close().
//
// As the "Locked" suffix indicates, callers in this file invoke it with e.mu
// held.
func (e *endpoint) completeWorkerLocked() {
	e.workerRunning = false
	if e.workerCleanup {
		e.cleanupLocked()
	}
}

// transitionToStateCloseLocked ensures that the endpoint is cleaned up from
// the transport demuxer *before* moving to StateClose. This ensures that no
// packet will be delivered to this endpoint from the demuxer once the endpoint
// has transitioned to StateClose.
//
// It is a no-op if the endpoint is already in StateClose.
func (e *endpoint) transitionToStateCloseLocked() {
	if e.state == StateClose {
		return
	}
	e.cleanupLocked()
	e.state = StateClose
}

// tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed segment
// to an endpoint other than the current one. This is called only when the
// endpoint is in StateClose and we want to deliver the segment to any other
// listening endpoint. We reply with RST if we cannot find one.
//
// When no endpoint is found the segment reference is released here; otherwise
// the segment is handed to the endpoint that was found.
func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, &s.route)
	if ep == nil {
		replyWithReset(s)
		s.decRef()
		return
	}
	ep.(*endpoint).enqueueSegment(s)
}

// handleReset processes a received segment that has the RST flag set.
//
// If the segment's sequence number is not acceptable, it returns (true, nil)
// without touching the segment; the caller keeps its reference and carries on.
//
// If the sequence number is acceptable, the segment reference is released and
// ok is false:
//   - in StateCloseWait the endpoint is moved to StateClose, HardError is set
//     to tcpip.ErrAborted and a nil error is returned;
//   - in every other state tcpip.ErrConnectionReset is returned.
func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
	if e.rcv.acceptable(s.sequenceNumber, 0) {
		// RFC 793, page 37 states that "in all states
		// except SYN-SENT, all reset (RST) segments are
		// validated by checking their SEQ-fields." So
		// we only process it if it's acceptable.
		s.decRef()
		e.mu.Lock()
		switch e.state {
		// In case of a RST in CLOSE-WAIT linux moves
		// the socket to closed state with an error set
		// to indicate EPIPE.
		//
		// Technically this seems to be at odds w/ RFC.
		// As per https://tools.ietf.org/html/rfc793#section-2.7
		// page 69 the behavior for a segment arriving
		// w/ RST bit set in CLOSE-WAIT is inlined below.
		//
		//  ESTABLISHED
		//  FIN-WAIT-1
		//  FIN-WAIT-2
		//  CLOSE-WAIT
		//
		//  If the RST bit is set then, any outstanding RECEIVEs and
		//  SEND should receive "reset" responses. All segment queues
		//  should be flushed. Users should also receive an unsolicited
		//  general "connection reset" signal. Enter the CLOSED state,
		//  delete the TCB, and return.
		case StateCloseWait:
			e.transitionToStateCloseLocked()
			e.HardError = tcpip.ErrAborted
			e.mu.Unlock()
			return false, nil
		default:
			e.mu.Unlock()
			return false, tcpip.ErrConnectionReset
		}
	}
	return true, nil
}

// handleSegments pulls segments from the queue and processes them. It returns
// no error if the protocol loop should continue, an error otherwise.
//
// At most maxSegmentsPerWake segments are processed per call. If segments are
// still queued after that, newSegmentWaker is asserted so that the caller is
// woken up again.
func (e *endpoint) handleSegments() *tcpip.Error {
	checkRequeue := true
	for i := 0; i < maxSegmentsPerWake; i++ {
		e.mu.RLock()
		state := e.state
		e.mu.RUnlock()
		if state == StateClose {
			// When we get into StateClose while processing from the queue,
			// return immediately and let the protocolMainloop handle it.
			//
			// We can reach StateClose only while processing a previous segment
			// or a notification from the protocolMainLoop (caller goroutine).
			// This means that with this return, the segment dequeue below can
			// never occur on a closed endpoint.
			return nil
		}

		s := e.segmentQueue.dequeue()
		if s == nil {
			// The queue is drained; no need to re-arm the waker.
			checkRequeue = false
			break
		}

		// Invoke the tcp probe if installed.
		if e.probe != nil {
			e.probe(e.completeState())
		}

		if s.flagIsSet(header.TCPFlagRst) {
			// handleReset releases the segment itself whenever it
			// returns !ok, so we must not decRef it again here.
			if ok, err := e.handleReset(s); !ok {
				return err
			}
		} else if s.flagIsSet(header.TCPFlagSyn) {
			// See: https://tools.ietf.org/html/rfc5961#section-4.1
			//   1) If the SYN bit is set, irrespective of the sequence number, TCP
			//    MUST send an ACK (also referred to as challenge ACK) to the remote
			//    peer:
			//
			//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
			//
			//    After sending the acknowledgment, TCP MUST drop the unacceptable
			//    segment and stop processing further.
			//
			// By sending an ACK, the remote peer is challenged to confirm the loss
			// of the previous connection and the request to start a new connection.
			// A legitimate peer, after restart, would not have a TCB in the
			// synchronized state. Thus, when the ACK arrives, the peer should send
			// a RST segment back with the sequence number derived from the ACK
			// field that caused the RST.
			//
			// This RST will confirm that the remote peer has indeed closed the
			// previous connection. Upon receipt of a valid RST, the local TCP
			// endpoint MUST terminate its connection. The local TCP endpoint
			// should then rely on SYN retransmission from the remote end to
			// re-establish the connection.

			e.snd.sendAck()
		} else if s.flagIsSet(header.TCPFlagAck) {
			// Patch the window size in the segment according to the
			// send window scale.
			s.window <<= e.snd.sndWndScale

			// RFC 793, page 41 states that "once in the ESTABLISHED
			// state all segments must carry current acknowledgment
			// information."
			drop, err := e.rcv.handleRcvdSegment(s)
			if err != nil {
				s.decRef()
				return err
			}
			if drop {
				s.decRef()
				continue
			}
			e.snd.handleRcvdSegment(s)
		}
		s.decRef()
	}

	// If the queue is not empty, make sure we'll wake up in the next
	// iteration.
	if checkRequeue && !e.segmentQueue.empty() {
		e.newSegmentWaker.Assert()
	}

	// Send an ACK for all processed packets if needed.
	if e.rcv.rcvNxt != e.snd.maxSentAck {
		e.snd.sendAck()
	}

	e.resetKeepaliveTimer(true)

	return nil
}

// keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
// keepalive packets periodically when the connection is idle. If we don't hear
// from the other side after a number of tries, we terminate the connection by
// returning tcpip.ErrTimeout.
func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
	e.keepalive.Lock()
	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
		e.keepalive.Unlock()
		return nil
	}

	if e.keepalive.unacked >= e.keepalive.count {
		e.keepalive.Unlock()
		return tcpip.ErrTimeout
	}

	// RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with
	// seg.seq = snd.nxt-1.
	e.keepalive.unacked++
	e.keepalive.Unlock()
	e.snd.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, e.snd.sndNxt-1)
	e.resetKeepaliveTimer(false)
	return nil
}

// resetKeepaliveTimer restarts or stops the keepalive timer, depending on
// whether it is enabled for this endpoint.
//
// When receivedData is true the count of unacknowledged keepalives is reset
// to zero first.
func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
	e.keepalive.Lock()
	defer e.keepalive.Unlock()
	if receivedData {
		e.keepalive.unacked = 0
	}
	// Start the keepalive timer IFF it's enabled and there is no pending
	// data to send.
	if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
		e.keepalive.timer.disable()
		return
	}
	// Use the shorter probe interval once a keepalive is outstanding, and the
	// idle time otherwise.
	if e.keepalive.unacked > 0 {
		e.keepalive.timer.enable(e.keepalive.interval)
	} else {
		e.keepalive.timer.enable(e.keepalive.idle)
	}
}

// disableKeepaliveTimer stops the keepalive timer.
func (e *endpoint) disableKeepaliveTimer() {
	e.keepalive.Lock()
	e.keepalive.timer.disable()
	e.keepalive.Unlock()
}

// protocolMainLoop is the main loop of the TCP protocol. It runs in its own
// goroutine and is responsible for sending segments and handling received
// segments.
//
// If handshake is true, an active 3-way handshake is executed first; its error
// (if any) is recorded on the endpoint and returned. In every other case the
// function returns nil, including when the loop is terminated by an error from
// one of the wake-up handlers (the connection is reset with that error
// instead).
func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
	var closeTimer *time.Timer
	var closeWaker sleep.Waker

	epilogue := func() {
		// e.mu is expected to be held upon entering this section; it is
		// released before the waiters are notified.

		if e.snd != nil {
			e.snd.resendTimer.cleanup()
		}

		if closeTimer != nil {
			closeTimer.Stop()
		}

		e.completeWorkerLocked()

		if e.drainDone != nil {
			close(e.drainDone)
		}

		e.mu.Unlock()
		// When the protocol loop exits we should wake up our waiters.
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
	}

	if handshake {
		// This is an active connection, so we must initiate the 3-way
		// handshake, and then inform potential waiters about its
		// completion.
		initialRcvWnd := e.initialReceiveWindow()
		h := newHandshake(e, seqnum.Size(initialRcvWnd))
		e.mu.Lock()
		h.ep.state = StateSynSent
		e.mu.Unlock()

		if err := h.execute(); err != nil {
			e.lastErrorMu.Lock()
			e.lastError = err
			e.lastErrorMu.Unlock()

			e.mu.Lock()
			// NOTE(review): CurrentEstablished is decremented here even
			// though it is only incremented after a successful handshake
			// below - confirm this accounting is intended.
			e.stack.Stats().TCP.EstablishedResets.Increment()
			e.stack.Stats().TCP.CurrentEstablished.Decrement()
			e.state = StateError
			e.HardError = err

			// Lock released below.
			epilogue()

			return err
		}

		// Transfer handshake state to TCP connection. We disable
		// receive window scaling if the peer doesn't support it
		// (indicated by a negative send window scale).
		e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)

		rcvBufSize := seqnum.Size(e.receiveBufferSize())
		e.rcvListMu.Lock()
		e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
		// Bootstrap the auto tuning algorithm. Starting at zero will
		// result in a large step function on the first adjustment,
		// causing the window to just go to a really large value after
		// the first RTT itself.
		e.rcvAutoParams.prevCopied = initialRcvWnd
		e.rcvListMu.Unlock()
		e.stack.Stats().TCP.CurrentEstablished.Increment()
		e.mu.Lock()
		e.state = StateEstablished
		e.mu.Unlock()
	}

	e.keepalive.timer.init(&e.keepalive.waker)
	defer e.keepalive.timer.cleanup()

	// Tell waiters that the endpoint is connected and writable.
	e.mu.Lock()
	drained := e.drainDone != nil
	e.mu.Unlock()
	if drained {
		close(e.drainDone)
		<-e.undrain
	}

	e.waiterQueue.Notify(waiter.EventOut)

	// Set up the functions that will be called when the main protocol loop
	// wakes up. The index of each entry doubles as the sleeper ID of its
	// waker.
	funcs := []struct {
		w *sleep.Waker
		f func() *tcpip.Error
	}{
		{
			w: &e.sndWaker,
			f: e.handleWrite,
		},
		{
			w: &e.sndCloseWaker,
			f: e.handleClose,
		},
		{
			w: &e.newSegmentWaker,
			f: e.handleSegments,
		},
		{
			w: &closeWaker,
			f: func() *tcpip.Error {
				// This means the socket is being closed because
				// the TCP_FIN_WAIT2 timeout was hit. Just
				// mark the socket as closed.
				e.mu.Lock()
				e.transitionToStateCloseLocked()
				e.mu.Unlock()
				return nil
			},
		},
		{
			w: &e.snd.resendWaker,
			f: func() *tcpip.Error {
				if !e.snd.retransmitTimerExpired() {
					return tcpip.ErrTimeout
				}
				return nil
			},
		},
		{
			w: &e.keepalive.waker,
			f: e.keepaliveTimerExpired,
		},
		{
			w: &e.notificationWaker,
			f: func() *tcpip.Error {
				n := e.fetchNotifications()
				if n&notifyNonZeroReceiveWindow != 0 {
					e.rcv.nonZeroWindow()
				}

				if n&notifyReceiveWindowChanged != 0 {
					e.rcv.pendingBufSize = seqnum.Size(e.receiveBufferSize())
				}

				if n&notifyMTUChanged != 0 {
					e.sndBufMu.Lock()
					count := e.packetTooBigCount
					e.packetTooBigCount = 0
					mtu := e.sndMTU
					e.sndBufMu.Unlock()

					e.snd.updateMaxPayloadSize(mtu, count)
				}

				if n&notifyReset != 0 {
					e.mu.Lock()
					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
					e.mu.Unlock()
				}

				if n&notifyClose != 0 && closeTimer == nil {
					e.mu.Lock()
					if e.state == StateFinWait2 && e.closed {
						// The socket has been closed and we are in FIN_WAIT2
						// so start the FIN_WAIT2 timer.
						closeTimer = time.AfterFunc(e.tcpLingerTimeout, func() {
							closeWaker.Assert()
						})
						e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
					}
					e.mu.Unlock()
				}

				if n&notifyKeepaliveChanged != 0 {
					// The timer could fire in background
					// when the endpoint is drained. That's
					// OK. See above.
					e.resetKeepaliveTimer(true)
				}

				if n&notifyDrain != 0 {
					// Read the state under e.mu, like every other
					// access to e.state in this loop.
					e.mu.RLock()
					state := e.state
					e.mu.RUnlock()

					// handleSegments returns without dequeuing
					// anything once the endpoint is in StateClose,
					// so stop draining at that point instead of
					// spinning on a queue that will never shrink.
					// Leftover segments are re-matched to other
					// endpoints when the main loop exits.
					for state != StateClose && !e.segmentQueue.empty() {
						if err := e.handleSegments(); err != nil {
							return err
						}
						e.mu.RLock()
						state = e.state
						e.mu.RUnlock()
					}
					if state != StateClose && state != StateError {
						// Only block the worker if the endpoint
						// is not in closed state or error state.
						close(e.drainDone)
						<-e.undrain
					}
				}

				if n&notifyTickleWorker != 0 {
					// Just a tickle notification. No need to do
					// anything.
					return nil
				}

				return nil
			},
		},
	}

	// Initialize the sleeper based on the wakers in funcs.
	s := sleep.Sleeper{}
	for i := range funcs {
		s.AddWaker(funcs[i].w, i)
	}

	// The following assertions and notifications are needed for restored
	// endpoints. Fresh newly created endpoints have empty states and should
	// not invoke any.
	e.segmentQueue.mu.Lock()
	if !e.segmentQueue.list.Empty() {
		e.newSegmentWaker.Assert()
	}
	e.segmentQueue.mu.Unlock()

	e.rcvListMu.Lock()
	if !e.rcvList.Empty() {
		e.waiterQueue.Notify(waiter.EventIn)
	}
	e.rcvListMu.Unlock()

	e.mu.Lock()
	if e.workerCleanup {
		e.notifyProtocolGoroutine(notifyClose)
	}

	// Main loop. Handle segments until both send and receive ends of the
	// connection have completed. e.mu is held while the loop condition is
	// evaluated and released while waiting for, and handling, a wake-up.
	for e.state != StateTimeWait && e.state != StateClose && e.state != StateError {
		e.mu.Unlock()
		e.workMu.Unlock()
		v, _ := s.Fetch(true)
		e.workMu.Lock()
		if err := funcs[v].f(); err != nil {
			e.mu.Lock()
			// Ensure we release all endpoint registration and route
			// references as the connection is now in an error
			// state.
			e.workerCleanup = true
			e.resetConnectionLocked(err)
			// Lock released below.
			epilogue()

			return nil
		}
		e.mu.Lock()
	}

	state := e.state
	e.mu.Unlock()
	var reuseTW func()
	if state == StateTimeWait {
		// Disable close timer as we are now entering real TIME_WAIT.
		if closeTimer != nil {
			closeTimer.Stop()
		}
		// Mark the current sleeper done so as to free all associated
		// wakers.
		s.Done()
		// Wake up any waiters before we enter TIME_WAIT.
		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
		reuseTW = e.doTimeWait()
	}

	// Mark endpoint as closed.
	e.mu.Lock()
	if e.state != StateError {
		e.stack.Stats().TCP.EstablishedResets.Increment()
		e.stack.Stats().TCP.CurrentEstablished.Decrement()
		e.transitionToStateCloseLocked()
	}

	// Lock released below.
	epilogue()

	// epilogue removes the endpoint from the transport-demuxer and
	// unlocks e.mu. Now that no new segments can get enqueued to this
	// endpoint, try to re-match the segment to a different endpoint
	// as the current endpoint is closed.
	for !e.segmentQueue.empty() {
		s := e.segmentQueue.dequeue()
		e.tryDeliverSegmentFromClosedEndpoint(s)
	}

	// A new SYN was received during TIME_WAIT and we need to abort
	// the timewait and redirect the segment to the listener queue.
	if reuseTW != nil {
		reuseTW()
	}

	return nil
}

// handleTimeWaitSegments processes segments received during TIME_WAIT
// state.
//
// extendTimeWait is true if any processed segment asked for the TIME_WAIT
// period to be extended. reuseTW is non-nil when a new SYN was received and a
// listening endpoint was found for it; calling reuseTW enqueues that segment
// on the listener. In that case processing stops immediately and the
// segment's reference is intentionally kept.
func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) {
	checkRequeue := true
	for i := 0; i < maxSegmentsPerWake; i++ {
		s := e.segmentQueue.dequeue()
		if s == nil {
			checkRequeue = false
			break
		}
		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
		if newSyn {
			// Look for a listener by clearing the remote half of this
			// endpoint's ID.
			info := e.EndpointInfo.TransportEndpointInfo
			newID := info.ID
			newID.RemoteAddress = ""
			newID.RemotePort = 0
			netProtos := []tcpip.NetworkProtocolNumber{info.NetProto}
			// If the local address is an IPv4 address then also
			// look for IPv6 dual stack endpoints that might be
			// listening on the local address.
			if newID.LocalAddress.To4() != "" {
				netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber}
			}
			for _, netProto := range netProtos {
				if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, &s.route); listenEP != nil {
					tcpEP := listenEP.(*endpoint)
					if EndpointState(tcpEP.State()) == StateListen {
						reuseTW = func() {
							tcpEP.enqueueSegment(s)
						}
						// We explicitly do not decRef
						// the segment as it's still
						// valid and being reflected to
						// a listening endpoint.
						return false, reuseTW
					}
				}
			}
		}
		if extTW {
			extendTimeWait = true
		}
		s.decRef()
	}
	if checkRequeue && !e.segmentQueue.empty() {
		e.newSegmentWaker.Assert()
	}
	return extendTimeWait, nil
}

// doTimeWait is responsible for handling the TCP behaviour once a socket
// enters the TIME_WAIT state. Optionally it can return a closure that
// should be executed after releasing the endpoint registrations. This is
// done in cases where a new SYN is received during TIME_WAIT that carries
// a sequence number larger than one seen on the connection.
func (e *endpoint) doTimeWait() (twReuse func()) {
	// Trigger a 2 * MSL time wait state. During this period
	// we will drop all incoming segments.
	// NOTE: On Linux this is not configurable and is fixed at 60 seconds.
	timeWaitDuration := DefaultTCPTimeWaitTimeout

	// Get the stack wide configuration.
	var tcpTW tcpip.TCPTimeWaitTimeoutOption
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil {
		timeWaitDuration = time.Duration(tcpTW)
	}

	// Sleeper IDs for the wakers handled while in TIME_WAIT.
	const (
		newSegment   = 1
		notification = 2
		timeWaitDone = 3
	)

	s := sleep.Sleeper{}
	s.AddWaker(&e.newSegmentWaker, newSegment)
	s.AddWaker(&e.notificationWaker, notification)

	var timeWaitWaker sleep.Waker
	s.AddWaker(&timeWaitWaker, timeWaitDone)
	timeWaitTimer := time.AfterFunc(timeWaitDuration, timeWaitWaker.Assert)
	defer timeWaitTimer.Stop()

	for {
		e.workMu.Unlock()
		v, _ := s.Fetch(true)
		e.workMu.Lock()
		switch v {
		case newSegment:
			extendTimeWait, reuseTW := e.handleTimeWaitSegments()
			if reuseTW != nil {
				return reuseTW
			}
			if extendTimeWait {
				timeWaitTimer.Reset(timeWaitDuration)
			}
		case notification:
			n := e.fetchNotifications()
			if n&notifyClose != 0 {
				return nil
			}
			if n&notifyDrain != 0 {
				for !e.segmentQueue.empty() {
					// Ignore extending TIME_WAIT during a
					// save. For sockets in TIME_WAIT we just
					// terminate the TIME_WAIT early.
					e.handleTimeWaitSegments()
				}
				close(e.drainDone)
				<-e.undrain
				return nil
			}
		case timeWaitDone:
			return nil
		}
	}
}