github.com/FlowerWrong/netstack@v0.0.0-20191009141956-e5848263af28/tcpip/transport/tcp/connect.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "sync" 19 "time" 20 21 "github.com/FlowerWrong/netstack/rand" 22 "github.com/FlowerWrong/netstack/sleep" 23 "github.com/FlowerWrong/netstack/tcpip" 24 "github.com/FlowerWrong/netstack/tcpip/buffer" 25 "github.com/FlowerWrong/netstack/tcpip/header" 26 "github.com/FlowerWrong/netstack/tcpip/seqnum" 27 "github.com/FlowerWrong/netstack/tcpip/stack" 28 "github.com/FlowerWrong/netstack/waiter" 29 ) 30 31 // maxSegmentsPerWake is the maximum number of segments to process in the main 32 // protocol goroutine per wake-up. Yielding [after this number of segments are 33 // processed] allows other events to be processed as well (e.g., timeouts, 34 // resets, etc.). 35 const maxSegmentsPerWake = 100 36 37 type handshakeState int 38 39 // The following are the possible states of the TCP connection during a 3-way 40 // handshake. A depiction of the states and transitions can be found in RFC 793, 41 // page 23. 42 const ( 43 handshakeSynSent handshakeState = iota 44 handshakeSynRcvd 45 handshakeCompleted 46 ) 47 48 // The following are used to set up sleepers. 49 const ( 50 wakerForNotification = iota 51 wakerForNewSegment 52 wakerForResend 53 wakerForResolution 54 ) 55 56 const ( 57 // Maximum space available for options. 
58 maxOptionSize = 40 59 ) 60 61 // handshake holds the state used during a TCP 3-way handshake. 62 type handshake struct { 63 ep *endpoint 64 state handshakeState 65 active bool 66 flags uint8 67 ackNum seqnum.Value 68 69 // iss is the initial send sequence number, as defined in RFC 793. 70 iss seqnum.Value 71 72 // rcvWnd is the receive window, as defined in RFC 793. 73 rcvWnd seqnum.Size 74 75 // sndWnd is the send window, as defined in RFC 793. 76 sndWnd seqnum.Size 77 78 // mss is the maximum segment size received from the peer. 79 mss uint16 80 81 // amss is the maximum segment size advertised by us to the peer. 82 amss uint16 83 84 // sndWndScale is the send window scale, as defined in RFC 1323. A 85 // negative value means no scaling is supported by the peer. 86 sndWndScale int 87 88 // rcvWndScale is the receive window scale, as defined in RFC 1323. 89 rcvWndScale int 90 } 91 92 func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake { 93 rcvWndScale := ep.rcvWndScaleForHandshake() 94 95 // Round-down the rcvWnd to a multiple of wndScale. This ensures that the 96 // window offered in SYN won't be reduced due to the loss of precision if 97 // window scaling is enabled after the handshake. 98 rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale) 99 100 // Ensure we can always accept at least 1 byte if the scale specified 101 // was too high for the provided rcvWnd. 102 if rcvWnd == 0 { 103 rcvWnd = 1 104 } 105 106 h := handshake{ 107 ep: ep, 108 active: true, 109 rcvWnd: rcvWnd, 110 rcvWndScale: int(rcvWndScale), 111 } 112 h.resetState() 113 return h 114 } 115 116 // FindWndScale determines the window scale to use for the given maximum window 117 // size. 
118 func FindWndScale(wnd seqnum.Size) int { 119 if wnd < 0x10000 { 120 return 0 121 } 122 123 max := seqnum.Size(0xffff) 124 s := 0 125 for wnd > max && s < header.MaxWndScale { 126 s++ 127 max <<= 1 128 } 129 130 return s 131 } 132 133 // resetState resets the state of the handshake object such that it becomes 134 // ready for a new 3-way handshake. 135 func (h *handshake) resetState() { 136 b := make([]byte, 4) 137 if _, err := rand.Read(b); err != nil { 138 panic(err) 139 } 140 141 h.state = handshakeSynSent 142 h.flags = header.TCPFlagSyn 143 h.ackNum = 0 144 h.mss = 0 145 h.iss = seqnum.Value(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24) 146 } 147 148 // effectiveRcvWndScale returns the effective receive window scale to be used. 149 // If the peer doesn't support window scaling, the effective rcv wnd scale is 150 // zero; otherwise it's the value calculated based on the initial rcv wnd. 151 func (h *handshake) effectiveRcvWndScale() uint8 { 152 if h.sndWndScale < 0 { 153 return 0 154 } 155 return uint8(h.rcvWndScale) 156 } 157 158 // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD 159 // state. 160 func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) { 161 h.active = false 162 h.state = handshakeSynRcvd 163 h.flags = header.TCPFlagSyn | header.TCPFlagAck 164 h.iss = iss 165 h.ackNum = irs + 1 166 h.mss = opts.MSS 167 h.sndWndScale = opts.WS 168 h.ep.mu.Lock() 169 h.ep.state = StateSynRecv 170 h.ep.mu.Unlock() 171 } 172 173 // checkAck checks if the ACK number, if present, of a segment received during 174 // a TCP 3-way handshake is valid. If it's not, a RST segment is sent back in 175 // response. 
func (h *handshake) checkAck(s *segment) bool {
	// The only acceptable ACK number at this point is the one that
	// acknowledges our SYN, i.e. iss+1.
	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber != h.iss+1 {
		// RFC 793, page 36, states that a reset must be generated when
		// the connection is in any non-synchronized state and an
		// incoming segment acknowledges something not yet sent. The
		// connection remains in the same state.
		ack := s.sequenceNumber.Add(s.logicalLen())
		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, s.ackNumber, ack, 0)
		return false
	}

	return true
}

// synSentState handles a segment received when the TCP 3-way handshake is in
// the SYN-SENT state.
//
// It returns tcpip.ErrConnectionRefused when the peer resets the connection
// with an ACK that acknowledges our SYN, and nil in every other case,
// including segments that are ignored.
func (h *handshake) synSentState(s *segment) *tcpip.Error {
	// RFC 793, page 37, states that in the SYN-SENT state, a reset is
	// acceptable if the ack field acknowledges the SYN.
	if s.flagIsSet(header.TCPFlagRst) {
		if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
			return tcpip.ErrConnectionRefused
		}
		return nil
	}

	// checkAck has already sent a RST if the ACK number was unacceptable.
	if !h.checkAck(s) {
		return nil
	}

	// We are in the SYN-SENT state. We only care about segments that have
	// the SYN flag.
	if !s.flagIsSet(header.TCPFlagSyn) {
		return nil
	}

	// Parse the SYN options.
	rcvSynOpts := parseSynSegmentOptions(s)

	// Remember if the Timestamp option was negotiated.
	h.ep.maybeEnableTimestamp(&rcvSynOpts)

	// Remember if the SACKPermitted option was negotiated.
	h.ep.maybeEnableSACKPermitted(&rcvSynOpts)

	// Remember the sequence we'll ack from now on.
	h.ackNum = s.sequenceNumber + 1
	h.flags |= header.TCPFlagAck
	h.mss = rcvSynOpts.MSS
	h.sndWndScale = rcvSynOpts.WS

	// If this is a SYN ACK response, we only need to acknowledge the SYN
	// and the handshake is completed.
	if s.flagIsSet(header.TCPFlagAck) {
		h.state = handshakeCompleted
		// The advertised window is shifted down by the effective
		// receive window scale before being sent.
		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
		return nil
	}

	// A SYN segment was received, but no ACK in it. We acknowledge the SYN
	// but resend our own SYN and wait for it to be acknowledged in the
	// SYN-RCVD state.
	h.state = handshakeSynRcvd
	h.ep.mu.Lock()
	h.ep.state = StateSynRecv
	ttl := h.ep.ttl
	h.ep.mu.Unlock()
	synOpts := header.TCPSynOptions{
		WS:    int(h.effectiveRcvWndScale()),
		TS:    rcvSynOpts.TS,
		TSVal: h.ep.timestamp(),
		TSEcr: h.ep.recentTS,

		// We only send SACKPermitted if the other side indicated it
		// permits SACK. This is not explicitly defined in the RFC but
		// this is the behaviour implemented by Linux.
		SACKPermitted: rcvSynOpts.SACKPermitted,
		MSS:           h.ep.amss,
	}
	// Use the route's default TTL when the endpoint's ttl is zero.
	if ttl == 0 {
		ttl = s.route.DefaultTTL()
	}
	sendSynTCP(&s.route, h.ep.id, ttl, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)

	return nil
}

// synRcvdState handles a segment received when the TCP 3-way handshake is in
// the SYN-RCVD state.
//
// It returns tcpip.ErrConnectionRefused on an in-window RST,
// tcpip.ErrInvalidEndpointState when a passive handshake receives a second
// SYN with a different sequence number, and nil otherwise.
func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
	if s.flagIsSet(header.TCPFlagRst) {
		// RFC 793, page 37, states that in the SYN-RCVD state, a reset
		// is acceptable if the sequence number is in the window.
		if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
			return tcpip.ErrConnectionRefused
		}
		return nil
	}

	// checkAck has already sent a RST if the ACK number was unacceptable.
	if !h.checkAck(s) {
		return nil
	}

	// h.ackNum-1 is the sequence number of the SYN we already accepted.
	if s.flagIsSet(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 {
		// We received two SYN segments with different sequence
		// numbers, so we reset this and restart the whole
		// process, except that we don't reset the timer.
		ack := s.sequenceNumber.Add(s.logicalLen())
		seq := seqnum.Value(0)
		if s.flagIsSet(header.TCPFlagAck) {
			seq = s.ackNumber
		}
		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0)

		// Only a handshake we initiated is restarted; otherwise the
		// error is reported to the caller.
		if !h.active {
			return tcpip.ErrInvalidEndpointState
		}

		h.resetState()
		synOpts := header.TCPSynOptions{
			WS:            h.rcvWndScale,
			TS:            h.ep.sendTSOk,
			TSVal:         h.ep.timestamp(),
			TSEcr:         h.ep.recentTS,
			SACKPermitted: h.ep.sackPermitted,
			MSS:           h.ep.amss,
		}
		// NOTE(review): h.ep.ttl is read here without holding h.ep.mu,
		// whereas synSentState copies it under the lock — confirm this
		// is intentional.
		sendSynTCP(&s.route, h.ep.id, h.ep.ttl, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
		return nil
	}

	// We have previously received (and acknowledged) the peer's SYN. If the
	// peer acknowledges our SYN, the handshake is completed.
	if s.flagIsSet(header.TCPFlagAck) {
		// If the timestamp option is negotiated and the segment does
		// not carry a timestamp option then the segment must be dropped
		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
		if h.ep.sendTSOk && !s.parsedOptions.TS {
			h.ep.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// Update timestamp if required. See RFC7323, section-4.3.
		if h.ep.sendTSOk && s.parsedOptions.TS {
			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
		}
		h.state = handshakeCompleted
		return nil
	}

	return nil
}

// handleSegment records the window advertised in s as the send window and
// then hands s to the handler for the current handshake state. Segments that
// arrive in any state other than SYN-SENT or SYN-RCVD are ignored.
func (h *handshake) handleSegment(s *segment) *tcpip.Error {
	h.sndWnd = s.window
	// The window of a non-SYN segment is scaled up when a positive send
	// window scale is set; the window of a SYN segment is used as-is.
	if !s.flagIsSet(header.TCPFlagSyn) && h.sndWndScale > 0 {
		h.sndWnd <<= uint8(h.sndWndScale)
	}

	switch h.state {
	case handshakeSynRcvd:
		return h.synRcvdState(s)
	case handshakeSynSent:
		return h.synSentState(s)
	}
	return nil
}

// processSegments goes through the segment queue and processes up to
// maxSegmentsPerWake (if they're available).
346 func (h *handshake) processSegments() *tcpip.Error { 347 for i := 0; i < maxSegmentsPerWake; i++ { 348 s := h.ep.segmentQueue.dequeue() 349 if s == nil { 350 return nil 351 } 352 353 err := h.handleSegment(s) 354 s.decRef() 355 if err != nil { 356 return err 357 } 358 359 // We stop processing packets once the handshake is completed, 360 // otherwise we may process packets meant to be processed by 361 // the main protocol goroutine. 362 if h.state == handshakeCompleted { 363 break 364 } 365 } 366 367 // If the queue is not empty, make sure we'll wake up in the next 368 // iteration. 369 if !h.ep.segmentQueue.empty() { 370 h.ep.newSegmentWaker.Assert() 371 } 372 373 return nil 374 } 375 376 func (h *handshake) resolveRoute() *tcpip.Error { 377 // Set up the wakers. 378 s := sleep.Sleeper{} 379 resolutionWaker := &sleep.Waker{} 380 s.AddWaker(resolutionWaker, wakerForResolution) 381 s.AddWaker(&h.ep.notificationWaker, wakerForNotification) 382 defer s.Done() 383 384 // Initial action is to resolve route. 385 index := wakerForResolution 386 for { 387 switch index { 388 case wakerForResolution: 389 if _, err := h.ep.route.Resolve(resolutionWaker); err != tcpip.ErrWouldBlock { 390 // Either success (err == nil) or failure. 391 return err 392 } 393 // Resolution not completed. Keep trying... 394 395 case wakerForNotification: 396 n := h.ep.fetchNotifications() 397 if n¬ifyClose != 0 { 398 h.ep.route.RemoveWaker(resolutionWaker) 399 return tcpip.ErrAborted 400 } 401 if n¬ifyDrain != 0 { 402 close(h.ep.drainDone) 403 <-h.ep.undrain 404 } 405 } 406 407 // Wait for notification. 408 index, _ = s.Fetch(true) 409 } 410 } 411 412 // execute executes the TCP 3-way handshake. 413 func (h *handshake) execute() *tcpip.Error { 414 if h.ep.route.IsResolutionRequired() { 415 if err := h.resolveRoute(); err != nil { 416 return err 417 } 418 } 419 420 // Initialize the resend timer. 
421 resendWaker := sleep.Waker{} 422 timeOut := time.Duration(time.Second) 423 rt := time.AfterFunc(timeOut, func() { 424 resendWaker.Assert() 425 }) 426 defer rt.Stop() 427 428 // Set up the wakers. 429 s := sleep.Sleeper{} 430 s.AddWaker(&resendWaker, wakerForResend) 431 s.AddWaker(&h.ep.notificationWaker, wakerForNotification) 432 s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment) 433 defer s.Done() 434 435 var sackEnabled SACKEnabled 436 if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil { 437 // If stack returned an error when checking for SACKEnabled 438 // status then just default to switching off SACK negotiation. 439 sackEnabled = false 440 } 441 442 // Send the initial SYN segment and loop until the handshake is 443 // completed. 444 h.ep.amss = mssForRoute(&h.ep.route) 445 446 synOpts := header.TCPSynOptions{ 447 WS: h.rcvWndScale, 448 TS: true, 449 TSVal: h.ep.timestamp(), 450 TSEcr: h.ep.recentTS, 451 SACKPermitted: bool(sackEnabled), 452 MSS: h.ep.amss, 453 } 454 455 // Execute is also called in a listen context so we want to make sure we 456 // only send the TS/SACK option when we received the TS/SACK in the 457 // initial SYN. 458 if h.state == handshakeSynRcvd { 459 synOpts.TS = h.ep.sendTSOk 460 synOpts.SACKPermitted = h.ep.sackPermitted && bool(sackEnabled) 461 if h.sndWndScale < 0 { 462 // Disable window scaling if the peer did not send us 463 // the window scaling option. 
464 synOpts.WS = -1 465 } 466 } 467 sendSynTCP(&h.ep.route, h.ep.id, h.ep.ttl, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts) 468 for h.state != handshakeCompleted { 469 switch index, _ := s.Fetch(true); index { 470 case wakerForResend: 471 timeOut *= 2 472 if timeOut > 60*time.Second { 473 return tcpip.ErrTimeout 474 } 475 rt.Reset(timeOut) 476 sendSynTCP(&h.ep.route, h.ep.id, h.ep.ttl, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts) 477 478 case wakerForNotification: 479 n := h.ep.fetchNotifications() 480 if n¬ifyClose != 0 { 481 return tcpip.ErrAborted 482 } 483 if n¬ifyDrain != 0 { 484 for !h.ep.segmentQueue.empty() { 485 s := h.ep.segmentQueue.dequeue() 486 err := h.handleSegment(s) 487 s.decRef() 488 if err != nil { 489 return err 490 } 491 if h.state == handshakeCompleted { 492 return nil 493 } 494 } 495 close(h.ep.drainDone) 496 <-h.ep.undrain 497 } 498 499 case wakerForNewSegment: 500 if err := h.processSegments(); err != nil { 501 return err 502 } 503 } 504 } 505 506 return nil 507 } 508 509 func parseSynSegmentOptions(s *segment) header.TCPSynOptions { 510 synOpts := header.ParseSynOptions(s.options, s.flagIsSet(header.TCPFlagAck)) 511 if synOpts.TS { 512 s.parsedOptions.TSVal = synOpts.TSVal 513 s.parsedOptions.TSEcr = synOpts.TSEcr 514 } 515 return synOpts 516 } 517 518 var optionPool = sync.Pool{ 519 New: func() interface{} { 520 return make([]byte, maxOptionSize) 521 }, 522 } 523 524 func getOptions() []byte { 525 return optionPool.Get().([]byte) 526 } 527 528 func putOptions(options []byte) { 529 // Reslice to full capacity. 530 optionPool.Put(options[0:cap(options)]) 531 } 532 533 func makeSynOptions(opts header.TCPSynOptions) []byte { 534 // Emulate linux option order. 
This is as follows: 535 // 536 // if md5: NOP NOP MD5SIG 18 md5sig(16) 537 // if mss: MSS 4 mss(2) 538 // if ts and sack_advertise: 539 // SACK 2 TIMESTAMP 2 timestamp(8) 540 // elif ts: NOP NOP TIMESTAMP 10 timestamp(8) 541 // elif sack: NOP NOP SACK 2 542 // if wscale: NOP WINDOW 3 ws(1) 543 // if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8)) 544 // [for each block] start_seq(4) end_seq(4) 545 // if fastopen_cookie: 546 // if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2) 547 // else: FASTOPEN (2 + len(cookie)) 548 // cookie(variable) [padding to four bytes] 549 // 550 options := getOptions() 551 552 // Always encode the mss. 553 offset := header.EncodeMSSOption(uint32(opts.MSS), options) 554 555 // Special ordering is required here. If both TS and SACK are enabled, 556 // then the SACK option precedes TS, with no padding. If they are 557 // enabled individually, then we see padding before the option. 558 if opts.TS && opts.SACKPermitted { 559 offset += header.EncodeSACKPermittedOption(options[offset:]) 560 offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) 561 } else if opts.TS { 562 offset += header.EncodeNOP(options[offset:]) 563 offset += header.EncodeNOP(options[offset:]) 564 offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) 565 } else if opts.SACKPermitted { 566 offset += header.EncodeNOP(options[offset:]) 567 offset += header.EncodeNOP(options[offset:]) 568 offset += header.EncodeSACKPermittedOption(options[offset:]) 569 } 570 571 // Initialize the WS option. 572 if opts.WS >= 0 { 573 offset += header.EncodeNOP(options[offset:]) 574 offset += header.EncodeWSOption(opts.WS, options[offset:]) 575 } 576 577 // Padding to the end; note that this never apply unless we add a 578 // fastopen option, we always expect the offset to remain the same. 
579 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 580 panic("unexpected option encoding") 581 } 582 583 return options[:offset] 584 } 585 586 func sendSynTCP(r *stack.Route, id stack.TransportEndpointID, ttl uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts header.TCPSynOptions) *tcpip.Error { 587 options := makeSynOptions(opts) 588 err := sendTCP(r, id, buffer.VectorisedView{}, ttl, flags, seq, ack, rcvWnd, options, nil) 589 putOptions(options) 590 return err 591 } 592 593 // sendTCP sends a TCP segment with the provided options via the provided 594 // network endpoint and under the provided identity. 595 func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error { 596 optLen := len(opts) 597 // Allocate a buffer for the TCP header. 598 hdr := buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen) 599 600 if rcvWnd > 0xffff { 601 rcvWnd = 0xffff 602 } 603 604 // Initialize the header. 605 tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen)) 606 tcp.Encode(&header.TCPFields{ 607 SrcPort: id.LocalPort, 608 DstPort: id.RemotePort, 609 SeqNum: uint32(seq), 610 AckNum: uint32(ack), 611 DataOffset: uint8(header.TCPMinimumSize + optLen), 612 Flags: flags, 613 WindowSize: uint16(rcvWnd), 614 }) 615 copy(tcp[header.TCPMinimumSize:], opts) 616 617 length := uint16(hdr.UsedLength() + data.Size()) 618 xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) 619 // Only calculate the checksum if offloading isn't supported. 620 if gso != nil && gso.NeedsCsum { 621 // This is called CHECKSUM_PARTIAL in the Linux kernel. We 622 // calculate a checksum of the pseudo-header and save it in the 623 // TCP header, then the kernel calculate a checksum of the 624 // header and data and get the right sum of the TCP packet. 
625 tcp.SetChecksum(xsum) 626 } else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 { 627 xsum = header.ChecksumVV(data, xsum) 628 tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) 629 } 630 631 r.Stats().TCP.SegmentsSent.Increment() 632 if (flags & header.TCPFlagRst) != 0 { 633 r.Stats().TCP.ResetsSent.Increment() 634 } 635 636 return r.WritePacket(gso, hdr, data, ProtocolNumber, ttl, ttl == 0 /* useDefaultTTL */) 637 } 638 639 // makeOptions makes an options slice. 640 func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte { 641 options := getOptions() 642 offset := 0 643 644 // N.B. the ordering here matches the ordering used by Linux internally 645 // and described in the raw makeOptions function. We don't include 646 // unnecessary cases here (post connection.) 647 if e.sendTSOk { 648 // Embed the timestamp if timestamp has been enabled. 649 // 650 // We only use the lower 32 bits of the unix time in 651 // milliseconds. This is similar to what Linux does where it 652 // uses the lower 32 bits of the jiffies value in the tsVal 653 // field of the timestamp option. 654 // 655 // Further, RFC7323 section-5.4 recommends millisecond 656 // resolution as the lowest recommended resolution for the 657 // timestamp clock. 658 // 659 // Ref: https://tools.ietf.org/html/rfc7323#section-5.4. 660 offset += header.EncodeNOP(options[offset:]) 661 offset += header.EncodeNOP(options[offset:]) 662 offset += header.EncodeTSOption(e.timestamp(), uint32(e.recentTS), options[offset:]) 663 } 664 if e.sackPermitted && len(sackBlocks) > 0 { 665 offset += header.EncodeNOP(options[offset:]) 666 offset += header.EncodeNOP(options[offset:]) 667 offset += header.EncodeSACKBlocks(sackBlocks, options[offset:]) 668 } 669 670 // We expect the above to produce an aligned offset. 
671 if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { 672 panic("unexpected option encoding") 673 } 674 675 return options[:offset] 676 } 677 678 // sendRaw sends a TCP segment to the endpoint's peer. 679 func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error { 680 var sackBlocks []header.SACKBlock 681 if e.state == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) { 682 sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] 683 } 684 options := e.makeOptions(sackBlocks) 685 err := sendTCP(&e.route, e.id, data, e.ttl, flags, seq, ack, rcvWnd, options, e.gso) 686 putOptions(options) 687 return err 688 } 689 690 func (e *endpoint) handleWrite() *tcpip.Error { 691 // Move packets from send queue to send list. The queue is accessible 692 // from other goroutines and protected by the send mutex, while the send 693 // list is only accessible from the handler goroutine, so it needs no 694 // mutexes. 695 e.sndBufMu.Lock() 696 697 first := e.sndQueue.Front() 698 if first != nil { 699 e.snd.writeList.PushBackList(&e.sndQueue) 700 e.snd.sndNxtList.UpdateForward(e.sndBufInQueue) 701 e.sndBufInQueue = 0 702 } 703 704 e.sndBufMu.Unlock() 705 706 // Initialize the next segment to write if it's currently nil. 707 if e.snd.writeNext == nil { 708 e.snd.writeNext = first 709 } 710 711 // Push out any new packets. 712 e.snd.sendData() 713 714 return nil 715 } 716 717 func (e *endpoint) handleClose() *tcpip.Error { 718 // Drain the send queue. 719 e.handleWrite() 720 721 // Mark send side as closed. 722 e.snd.closed = true 723 724 return nil 725 } 726 727 // resetConnectionLocked puts the endpoint in an error state with the given 728 // error code and sends a RST if and only if the error is not ErrConnectionReset 729 // indicating that the connection is being reset due to receiving a RST. This 730 // method must only be called from the protocol goroutine. 
731 func (e *endpoint) resetConnectionLocked(err *tcpip.Error) { 732 // Only send a reset if the connection is being aborted for a reason 733 // other than receiving a reset. 734 e.state = StateError 735 e.hardError = err 736 if err != tcpip.ErrConnectionReset { 737 e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, e.snd.sndUna, e.rcv.rcvNxt, 0) 738 } 739 } 740 741 // completeWorkerLocked is called by the worker goroutine when it's about to 742 // exit. It marks the worker as completed and performs cleanup work if requested 743 // by Close(). 744 func (e *endpoint) completeWorkerLocked() { 745 e.workerRunning = false 746 if e.workerCleanup { 747 e.cleanupLocked() 748 } 749 } 750 751 // handleSegments pulls segments from the queue and processes them. It returns 752 // no error if the protocol loop should continue, an error otherwise. 753 func (e *endpoint) handleSegments() *tcpip.Error { 754 checkRequeue := true 755 for i := 0; i < maxSegmentsPerWake; i++ { 756 s := e.segmentQueue.dequeue() 757 if s == nil { 758 checkRequeue = false 759 break 760 } 761 762 // Invoke the tcp probe if installed. 763 if e.probe != nil { 764 e.probe(e.completeState()) 765 } 766 767 if s.flagIsSet(header.TCPFlagRst) { 768 if e.rcv.acceptable(s.sequenceNumber, 0) { 769 // RFC 793, page 37 states that "in all states 770 // except SYN-SENT, all reset (RST) segments are 771 // validated by checking their SEQ-fields." So 772 // we only process it if it's acceptable. 773 s.decRef() 774 return tcpip.ErrConnectionReset 775 } 776 } else if s.flagIsSet(header.TCPFlagAck) { 777 // Patch the window size in the segment according to the 778 // send window scale. 779 s.window <<= e.snd.sndWndScale 780 781 // RFC 793, page 41 states that "once in the ESTABLISHED 782 // state all segments must carry current acknowledgment 783 // information." 
784 e.rcv.handleRcvdSegment(s) 785 e.snd.handleRcvdSegment(s) 786 } 787 s.decRef() 788 } 789 790 // If the queue is not empty, make sure we'll wake up in the next 791 // iteration. 792 if checkRequeue && !e.segmentQueue.empty() { 793 e.newSegmentWaker.Assert() 794 } 795 796 // Send an ACK for all processed packets if needed. 797 if e.rcv.rcvNxt != e.snd.maxSentAck { 798 e.snd.sendAck() 799 } 800 801 e.resetKeepaliveTimer(true) 802 803 return nil 804 } 805 806 // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP 807 // keepalive packets periodically when the connection is idle. If we don't hear 808 // from the other side after a number of tries, we terminate the connection. 809 func (e *endpoint) keepaliveTimerExpired() *tcpip.Error { 810 e.keepalive.Lock() 811 if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() { 812 e.keepalive.Unlock() 813 return nil 814 } 815 816 if e.keepalive.unacked >= e.keepalive.count { 817 e.keepalive.Unlock() 818 return tcpip.ErrTimeout 819 } 820 821 // RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with 822 // seg.seq = snd.nxt-1. 823 e.keepalive.unacked++ 824 e.keepalive.Unlock() 825 e.snd.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, e.snd.sndNxt-1) 826 e.resetKeepaliveTimer(false) 827 return nil 828 } 829 830 // resetKeepaliveTimer restarts or stops the keepalive timer, depending on 831 // whether it is enabled for this endpoint. 832 func (e *endpoint) resetKeepaliveTimer(receivedData bool) { 833 e.keepalive.Lock() 834 defer e.keepalive.Unlock() 835 if receivedData { 836 e.keepalive.unacked = 0 837 } 838 // Start the keepalive timer IFF it's enabled and there is no pending 839 // data to send. 
840 if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt { 841 e.keepalive.timer.disable() 842 return 843 } 844 if e.keepalive.unacked > 0 { 845 e.keepalive.timer.enable(e.keepalive.interval) 846 } else { 847 e.keepalive.timer.enable(e.keepalive.idle) 848 } 849 } 850 851 // disableKeepaliveTimer stops the keepalive timer. 852 func (e *endpoint) disableKeepaliveTimer() { 853 e.keepalive.Lock() 854 e.keepalive.timer.disable() 855 e.keepalive.Unlock() 856 } 857 858 // protocolMainLoop is the main loop of the TCP protocol. It runs in its own 859 // goroutine and is responsible for sending segments and handling received 860 // segments. 861 func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { 862 var closeTimer *time.Timer 863 var closeWaker sleep.Waker 864 865 epilogue := func() { 866 // e.mu is expected to be hold upon entering this section. 867 868 if e.snd != nil { 869 e.snd.resendTimer.cleanup() 870 } 871 872 if closeTimer != nil { 873 closeTimer.Stop() 874 } 875 876 e.completeWorkerLocked() 877 878 if e.drainDone != nil { 879 close(e.drainDone) 880 } 881 882 e.mu.Unlock() 883 884 // When the protocol loop exits we should wake up our waiters. 885 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) 886 } 887 888 if handshake { 889 // This is an active connection, so we must initiate the 3-way 890 // handshake, and then inform potential waiters about its 891 // completion. 892 initialRcvWnd := e.initialReceiveWindow() 893 h := newHandshake(e, seqnum.Size(initialRcvWnd)) 894 e.mu.Lock() 895 h.ep.state = StateSynSent 896 e.mu.Unlock() 897 898 if err := h.execute(); err != nil { 899 e.lastErrorMu.Lock() 900 e.lastError = err 901 e.lastErrorMu.Unlock() 902 903 e.mu.Lock() 904 e.state = StateError 905 e.hardError = err 906 907 // Lock released below. 908 epilogue() 909 910 return err 911 } 912 913 // Transfer handshake state to TCP connection. 
We disable 914 // receive window scaling if the peer doesn't support it 915 // (indicated by a negative send window scale). 916 e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) 917 918 rcvBufSize := seqnum.Size(e.receiveBufferSize()) 919 e.rcvListMu.Lock() 920 e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize) 921 // boot strap the auto tuning algorithm. Starting at zero will 922 // result in a large step function on the first proper causing 923 // the window to just go to a really large value after the first 924 // RTT itself. 925 e.rcvAutoParams.prevCopied = initialRcvWnd 926 e.rcvListMu.Unlock() 927 } 928 929 e.keepalive.timer.init(&e.keepalive.waker) 930 defer e.keepalive.timer.cleanup() 931 932 // Tell waiters that the endpoint is connected and writable. 933 e.mu.Lock() 934 e.state = StateEstablished 935 drained := e.drainDone != nil 936 e.mu.Unlock() 937 if drained { 938 close(e.drainDone) 939 <-e.undrain 940 } 941 942 e.waiterQueue.Notify(waiter.EventOut) 943 944 // Set up the functions that will be called when the main protocol loop 945 // wakes up. 
946 funcs := []struct { 947 w *sleep.Waker 948 f func() *tcpip.Error 949 }{ 950 { 951 w: &e.sndWaker, 952 f: e.handleWrite, 953 }, 954 { 955 w: &e.sndCloseWaker, 956 f: e.handleClose, 957 }, 958 { 959 w: &e.newSegmentWaker, 960 f: e.handleSegments, 961 }, 962 { 963 w: &closeWaker, 964 f: func() *tcpip.Error { 965 return tcpip.ErrConnectionAborted 966 }, 967 }, 968 { 969 w: &e.snd.resendWaker, 970 f: func() *tcpip.Error { 971 if !e.snd.retransmitTimerExpired() { 972 return tcpip.ErrTimeout 973 } 974 return nil 975 }, 976 }, 977 { 978 w: &e.keepalive.waker, 979 f: e.keepaliveTimerExpired, 980 }, 981 { 982 w: &e.notificationWaker, 983 f: func() *tcpip.Error { 984 n := e.fetchNotifications() 985 if n¬ifyNonZeroReceiveWindow != 0 { 986 e.rcv.nonZeroWindow() 987 } 988 989 if n¬ifyReceiveWindowChanged != 0 { 990 e.rcv.pendingBufSize = seqnum.Size(e.receiveBufferSize()) 991 } 992 993 if n¬ifyMTUChanged != 0 { 994 e.sndBufMu.Lock() 995 count := e.packetTooBigCount 996 e.packetTooBigCount = 0 997 mtu := e.sndMTU 998 e.sndBufMu.Unlock() 999 1000 e.snd.updateMaxPayloadSize(mtu, count) 1001 } 1002 1003 if n¬ifyReset != 0 { 1004 e.mu.Lock() 1005 e.resetConnectionLocked(tcpip.ErrConnectionAborted) 1006 e.mu.Unlock() 1007 } 1008 if n¬ifyClose != 0 && closeTimer == nil { 1009 // Reset the connection 3 seconds after 1010 // the endpoint has been closed. 1011 // 1012 // The timer could fire in background 1013 // when the endpoint is drained. That's 1014 // OK as the loop here will not honor 1015 // the firing until the undrain arrives. 1016 closeTimer = time.AfterFunc(3*time.Second, func() { 1017 closeWaker.Assert() 1018 }) 1019 } 1020 1021 if n¬ifyKeepaliveChanged != 0 { 1022 // The timer could fire in background 1023 // when the endpoint is drained. That's 1024 // OK. See above. 
1025 e.resetKeepaliveTimer(true) 1026 } 1027 1028 if n¬ifyDrain != 0 { 1029 for !e.segmentQueue.empty() { 1030 if err := e.handleSegments(); err != nil { 1031 return err 1032 } 1033 } 1034 if e.state != StateError { 1035 close(e.drainDone) 1036 <-e.undrain 1037 } 1038 } 1039 1040 return nil 1041 }, 1042 }, 1043 } 1044 1045 // Initialize the sleeper based on the wakers in funcs. 1046 s := sleep.Sleeper{} 1047 for i := range funcs { 1048 s.AddWaker(funcs[i].w, i) 1049 } 1050 1051 // The following assertions and notifications are needed for restored 1052 // endpoints. Fresh newly created endpoints have empty states and should 1053 // not invoke any. 1054 e.segmentQueue.mu.Lock() 1055 if !e.segmentQueue.list.Empty() { 1056 e.newSegmentWaker.Assert() 1057 } 1058 e.segmentQueue.mu.Unlock() 1059 1060 e.rcvListMu.Lock() 1061 if !e.rcvList.Empty() { 1062 e.waiterQueue.Notify(waiter.EventIn) 1063 } 1064 e.rcvListMu.Unlock() 1065 1066 e.mu.RLock() 1067 if e.workerCleanup { 1068 e.notifyProtocolGoroutine(notifyClose) 1069 } 1070 e.mu.RUnlock() 1071 1072 // Main loop. Handle segments until both send and receive ends of the 1073 // connection have completed. 1074 for !e.rcv.closed || !e.snd.closed || e.snd.sndUna != e.snd.sndNxtList { 1075 e.workMu.Unlock() 1076 v, _ := s.Fetch(true) 1077 e.workMu.Lock() 1078 if err := funcs[v].f(); err != nil { 1079 e.mu.Lock() 1080 // Ensure we release all endpoint registration and route 1081 // references as the connection is now in an error 1082 // state. 1083 e.workerCleanup = true 1084 e.resetConnectionLocked(err) 1085 // Lock released below. 1086 epilogue() 1087 1088 return nil 1089 } 1090 } 1091 1092 // Mark endpoint as closed. 1093 e.mu.Lock() 1094 if e.state != StateError { 1095 e.state = StateClose 1096 } 1097 // Lock released below. 1098 epilogue() 1099 1100 return nil 1101 }