github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/tcp/accept.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "crypto/sha1" 19 "encoding/binary" 20 "fmt" 21 "hash" 22 "io" 23 "sync/atomic" 24 "time" 25 26 "github.com/SagerNet/gvisor/pkg/sleep" 27 "github.com/SagerNet/gvisor/pkg/sync" 28 "github.com/SagerNet/gvisor/pkg/tcpip" 29 "github.com/SagerNet/gvisor/pkg/tcpip/header" 30 "github.com/SagerNet/gvisor/pkg/tcpip/ports" 31 "github.com/SagerNet/gvisor/pkg/tcpip/seqnum" 32 "github.com/SagerNet/gvisor/pkg/tcpip/stack" 33 "github.com/SagerNet/gvisor/pkg/waiter" 34 ) 35 36 const ( 37 // tsLen is the length, in bits, of the timestamp in the SYN cookie. 38 tsLen = 8 39 40 // tsMask is a mask for timestamp values (i.e., tsLen bits). 41 tsMask = (1 << tsLen) - 1 42 43 // tsOffset is the offset, in bits, of the timestamp in the SYN cookie. 44 tsOffset = 24 45 46 // hashMask is the mask for hash values (i.e., tsOffset bits). 47 hashMask = (1 << tsOffset) - 1 48 49 // maxTSDiff is the maximum allowed difference between a received cookie 50 // timestamp and the current timestamp. If the difference is greater 51 // than maxTSDiff, the cookie is expired. 52 maxTSDiff = 2 53 ) 54 55 var ( 56 // mssTable is a slice containing the possible MSS values that we 57 // encode in the SYN cookie with two bits. 58 mssTable = []uint16{536, 1300, 1440, 1460} 59 ) 60 61 func encodeMSS(mss uint16) uint32 { 62 for i := len(mssTable) - 1; i > 0; i-- { 63 if mss >= mssTable[i] { 64 return uint32(i) 65 } 66 } 67 return 0 68 } 69 70 // listenContext is used by a listening endpoint to store state used while 71 // listening for connections. This struct is allocated by the listen goroutine 72 // and must not be accessed or have its methods called concurrently as they 73 // may mutate the stored objects. 74 type listenContext struct { 75 stack *stack.Stack 76 77 // rcvWnd is the receive window that is sent by this listening context 78 // in the initial SYN-ACK. 79 rcvWnd seqnum.Size 80 81 // nonce are random bytes that are initialized once when the context 82 // is created and used to seed the hash function when generating 83 // the SYN cookie. 84 nonce [2][sha1.BlockSize]byte 85 86 // listenEP is a reference to the listening endpoint associated with 87 // this context. Can be nil if the context is created by the forwarder. 88 listenEP *endpoint 89 90 // hasherMu protects hasher. 91 hasherMu sync.Mutex 92 // hasher is the hash function used to generate a SYN cookie. 93 hasher hash.Hash 94 95 // v6Only is true if listenEP is a dual stack socket and has the 96 // IPV6_V6ONLY option set. 97 v6Only bool 98 99 // netProto indicates the network protocol(IPv4/v6) for the listening 100 // endpoint. 101 netProto tcpip.NetworkProtocolNumber 102 103 // pendingMu protects pendingEndpoints. This should only be accessed 104 // by the listening endpoint's worker goroutine. 105 // 106 // Lock Ordering: listenEP.workerMu -> pendingMu 107 pendingMu sync.Mutex 108 // pending is used to wait for all pendingEndpoints to finish when 109 // a socket is closed. 110 pending sync.WaitGroup 111 // pendingEndpoints is a map of all endpoints for which a handshake is 112 // in progress. 113 pendingEndpoints map[stack.TransportEndpointID]*endpoint 114 } 115 116 // timeStamp returns an 8-bit timestamp with a granularity of 64 seconds. 117 func timeStamp(clock tcpip.Clock) uint32 { 118 return uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Seconds()) >> 6 & tsMask 119 } 120 121 // newListenContext creates a new listen context. 122 func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext { 123 l := &listenContext{ 124 stack: stk, 125 rcvWnd: rcvWnd, 126 hasher: sha1.New(), 127 v6Only: v6Only, 128 netProto: netProto, 129 listenEP: listenEP, 130 pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint), 131 } 132 133 for i := range l.nonce { 134 if _, err := io.ReadFull(stk.SecureRNG(), l.nonce[i][:]); err != nil { 135 panic(err) 136 } 137 } 138 139 return l 140 } 141 142 // cookieHash calculates the cookieHash for the given id, timestamp and nonce 143 // index. The hash is used to create and validate cookies. 144 func (l *listenContext) cookieHash(id stack.TransportEndpointID, ts uint32, nonceIndex int) uint32 { 145 146 // Initialize block with fixed-size data: local ports and v. 147 var payload [8]byte 148 binary.BigEndian.PutUint16(payload[0:], id.LocalPort) 149 binary.BigEndian.PutUint16(payload[2:], id.RemotePort) 150 binary.BigEndian.PutUint32(payload[4:], ts) 151 152 // Feed everything to the hasher. 153 l.hasherMu.Lock() 154 l.hasher.Reset() 155 156 // Per hash.Hash.Writer: 157 // 158 // It never returns an error. 159 l.hasher.Write(payload[:]) 160 l.hasher.Write(l.nonce[nonceIndex][:]) 161 l.hasher.Write([]byte(id.LocalAddress)) 162 l.hasher.Write([]byte(id.RemoteAddress)) 163 164 // Finalize the calculation of the hash and return the first 4 bytes. 165 h := l.hasher.Sum(nil) 166 l.hasherMu.Unlock() 167 168 return binary.BigEndian.Uint32(h[:]) 169 } 170 171 // createCookie creates a SYN cookie for the given id and incoming sequence 172 // number. 173 func (l *listenContext) createCookie(id stack.TransportEndpointID, seq seqnum.Value, data uint32) seqnum.Value { 174 ts := timeStamp(l.stack.Clock()) 175 v := l.cookieHash(id, 0, 0) + uint32(seq) + (ts << tsOffset) 176 v += (l.cookieHash(id, ts, 1) + data) & hashMask 177 return seqnum.Value(v) 178 } 179 180 // isCookieValid checks if the supplied cookie is valid for the given id and 181 // sequence number. If it is, it also returns the data originally encoded in the 182 // cookie when createCookie was called. 183 func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnum.Value, seq seqnum.Value) (uint32, bool) { 184 ts := timeStamp(l.stack.Clock()) 185 v := uint32(cookie) - l.cookieHash(id, 0, 0) - uint32(seq) 186 cookieTS := v >> tsOffset 187 if ((ts - cookieTS) & tsMask) > maxTSDiff { 188 return 0, false 189 } 190 191 return (v - l.cookieHash(id, cookieTS, 1)) & hashMask, true 192 } 193 194 func (l *listenContext) useSynCookies() bool { 195 var alwaysUseSynCookies tcpip.TCPAlwaysUseSynCookies 196 if err := l.stack.TransportProtocolOption(header.TCPProtocolNumber, &alwaysUseSynCookies); err != nil { 197 panic(fmt.Sprintf("TransportProtocolOption(%d, %T) = %s", header.TCPProtocolNumber, alwaysUseSynCookies, err)) 198 } 199 return bool(alwaysUseSynCookies) || (l.listenEP != nil && l.listenEP.synRcvdBacklogFull()) 200 } 201 202 // createConnectingEndpoint creates a new endpoint in a connecting state, with 203 // the connection parameters given by the arguments. 204 func (l *listenContext) createConnectingEndpoint(s *segment, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, tcpip.Error) { 205 // Create a new endpoint. 206 netProto := l.netProto 207 if netProto == 0 { 208 netProto = s.netProto 209 } 210 211 route, err := l.stack.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */) 212 if err != nil { 213 return nil, err 214 } 215 216 n := newEndpoint(l.stack, netProto, queue) 217 n.ops.SetV6Only(l.v6Only) 218 n.TransportEndpointInfo.ID = s.id 219 n.boundNICID = s.nicID 220 n.route = route 221 n.effectiveNetProtos = []tcpip.NetworkProtocolNumber{s.netProto} 222 n.ops.SetReceiveBufferSize(int64(l.rcvWnd), false /* notify */) 223 n.amss = calculateAdvertisedMSS(n.userMSS, n.route) 224 n.setEndpointState(StateConnecting) 225 226 n.maybeEnableTimestamp(rcvdSynOpts) 227 n.maybeEnableSACKPermitted(rcvdSynOpts) 228 229 n.initGSO() 230 231 // Bootstrap the auto tuning algorithm. Starting at zero will result in 232 // a large step function on the first window adjustment causing the 233 // window to grow to a really large value. 234 n.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = n.initialReceiveWindow() 235 236 return n, nil 237 } 238 239 // startHandshake creates a new endpoint in connecting state and then sends 240 // the SYN-ACK for the TCP 3-way handshake. It returns the state of the 241 // handshake in progress, which includes the new endpoint in the SYN-RCVD 242 // state. 243 // 244 // On success, a handshake h is returned with h.ep.mu held. 245 // 246 // Precondition: if l.listenEP != nil, l.listenEP.mu must be locked. 247 func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*handshake, tcpip.Error) { 248 // Create new endpoint. 249 irs := s.sequenceNumber 250 isn := generateSecureISN(s.id, l.stack.Clock(), l.stack.Seed()) 251 ep, err := l.createConnectingEndpoint(s, opts, queue) 252 if err != nil { 253 return nil, err 254 } 255 256 // Lock the endpoint before registering to ensure that no out of 257 // band changes are possible due to incoming packets etc till 258 // the endpoint is done initializing. 259 ep.mu.Lock() 260 ep.owner = owner 261 262 // listenEP is nil when listenContext is used by tcp.Forwarder. 263 deferAccept := time.Duration(0) 264 if l.listenEP != nil { 265 if l.listenEP.EndpointState() != StateListen { 266 267 // Ensure we release any registrations done by the newly 268 // created endpoint. 269 ep.mu.Unlock() 270 ep.Close() 271 272 return nil, &tcpip.ErrConnectionAborted{} 273 } 274 l.addPendingEndpoint(ep) 275 276 // Propagate any inheritable options from the listening endpoint 277 // to the newly created endpoint. 278 l.listenEP.propagateInheritableOptionsLocked(ep) 279 280 if !ep.reserveTupleLocked() { 281 ep.mu.Unlock() 282 ep.Close() 283 284 l.removePendingEndpoint(ep) 285 286 return nil, &tcpip.ErrConnectionAborted{} 287 } 288 289 deferAccept = l.listenEP.deferAccept 290 } 291 292 // Register new endpoint so that packets are routed to it. 293 if err := ep.stack.RegisterTransportEndpoint( 294 ep.effectiveNetProtos, 295 ProtocolNumber, 296 ep.TransportEndpointInfo.ID, 297 ep, 298 ep.boundPortFlags, 299 ep.boundBindToDevice, 300 ); err != nil { 301 ep.mu.Unlock() 302 ep.Close() 303 304 if l.listenEP != nil { 305 l.removePendingEndpoint(ep) 306 } 307 308 ep.drainClosingSegmentQueue() 309 310 return nil, err 311 } 312 313 ep.isRegistered = true 314 315 // Initialize and start the handshake. 316 h := ep.newPassiveHandshake(isn, irs, opts, deferAccept) 317 h.listenEP = l.listenEP 318 h.start() 319 return h, nil 320 } 321 322 // performHandshake performs a TCP 3-way handshake. On success, the new 323 // established endpoint is returned with e.mu held. 324 // 325 // Precondition: if l.listenEP != nil, l.listenEP.mu must be locked. 326 func (l *listenContext) performHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*endpoint, tcpip.Error) { 327 h, err := l.startHandshake(s, opts, queue, owner) 328 if err != nil { 329 return nil, err 330 } 331 ep := h.ep 332 333 // N.B. the endpoint is generated above by startHandshake, and will be 334 // returned locked. This first call is forced. 335 if err := h.complete(); err != nil { // +checklocksforce 336 ep.stack.Stats().TCP.FailedConnectionAttempts.Increment() 337 ep.stats.FailedConnectionAttempts.Increment() 338 l.cleanupFailedHandshake(h) 339 return nil, err 340 } 341 l.cleanupCompletedHandshake(h) 342 return ep, nil 343 } 344 345 func (l *listenContext) addPendingEndpoint(n *endpoint) { 346 l.pendingMu.Lock() 347 l.pendingEndpoints[n.TransportEndpointInfo.ID] = n 348 l.pending.Add(1) 349 l.pendingMu.Unlock() 350 } 351 352 func (l *listenContext) removePendingEndpoint(n *endpoint) { 353 l.pendingMu.Lock() 354 delete(l.pendingEndpoints, n.TransportEndpointInfo.ID) 355 l.pending.Done() 356 l.pendingMu.Unlock() 357 } 358 359 func (l *listenContext) closeAllPendingEndpoints() { 360 l.pendingMu.Lock() 361 for _, n := range l.pendingEndpoints { 362 n.notifyProtocolGoroutine(notifyClose) 363 } 364 l.pendingMu.Unlock() 365 l.pending.Wait() 366 } 367 368 // Precondition: h.ep.mu must be held. 369 // +checklocks:h.ep.mu 370 func (l *listenContext) cleanupFailedHandshake(h *handshake) { 371 e := h.ep 372 e.mu.Unlock() 373 e.Close() 374 e.notifyAborted() 375 if l.listenEP != nil { 376 l.removePendingEndpoint(e) 377 } 378 e.drainClosingSegmentQueue() 379 e.h = nil 380 } 381 382 // cleanupCompletedHandshake transfers any state from the completed handshake to 383 // the new endpoint. 384 // 385 // Precondition: h.ep.mu must be held. 386 func (l *listenContext) cleanupCompletedHandshake(h *handshake) { 387 e := h.ep 388 if l.listenEP != nil { 389 l.removePendingEndpoint(e) 390 } 391 e.isConnectNotified = true 392 393 // Update the receive window scaling. We can't do it before the 394 // handshake because it's possible that the peer doesn't support window 395 // scaling. 396 e.rcv.RcvWndScale = e.h.effectiveRcvWndScale() 397 398 // Clean up handshake state stored in the endpoint so that it can be GCed. 399 e.h = nil 400 } 401 402 // deliverAccepted delivers the newly-accepted endpoint to the listener. If the 403 // listener has transitioned out of the listen state (accepted is the zero 404 // value), the new endpoint is reset instead. 405 func (e *endpoint) deliverAccepted(n *endpoint, withSynCookie bool) { 406 e.mu.Lock() 407 e.pendingAccepted.Add(1) 408 e.mu.Unlock() 409 defer e.pendingAccepted.Done() 410 411 // Drop the lock before notifying to avoid deadlock in user-specified 412 // callbacks. 413 delivered := func() bool { 414 e.acceptMu.Lock() 415 defer e.acceptMu.Unlock() 416 for { 417 if e.accepted == (accepted{}) { 418 return false 419 } 420 if e.accepted.endpoints.Len() == e.accepted.cap { 421 e.acceptCond.Wait() 422 continue 423 } 424 425 e.accepted.endpoints.PushBack(n) 426 if !withSynCookie { 427 atomic.AddInt32(&e.synRcvdCount, -1) 428 } 429 return true 430 } 431 }() 432 if delivered { 433 e.waiterQueue.Notify(waiter.ReadableEvents) 434 } else { 435 n.notifyProtocolGoroutine(notifyReset) 436 } 437 } 438 439 // propagateInheritableOptionsLocked propagates any options set on the listening 440 // endpoint to the newly created endpoint. 441 // 442 // Precondition: e.mu and n.mu must be held. 443 func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) { 444 n.userTimeout = e.userTimeout 445 n.portFlags = e.portFlags 446 n.boundBindToDevice = e.boundBindToDevice 447 n.boundPortFlags = e.boundPortFlags 448 n.userMSS = e.userMSS 449 } 450 451 // reserveTupleLocked reserves an accepted endpoint's tuple. 452 // 453 // Preconditions: 454 // * propagateInheritableOptionsLocked has been called. 455 // * e.mu is held. 456 func (e *endpoint) reserveTupleLocked() bool { 457 dest := tcpip.FullAddress{ 458 Addr: e.TransportEndpointInfo.ID.RemoteAddress, 459 Port: e.TransportEndpointInfo.ID.RemotePort, 460 } 461 portRes := ports.Reservation{ 462 Networks: e.effectiveNetProtos, 463 Transport: ProtocolNumber, 464 Addr: e.TransportEndpointInfo.ID.LocalAddress, 465 Port: e.TransportEndpointInfo.ID.LocalPort, 466 Flags: e.boundPortFlags, 467 BindToDevice: e.boundBindToDevice, 468 Dest: dest, 469 } 470 if !e.stack.ReserveTuple(portRes) { 471 e.stack.Stats().TCP.FailedPortReservations.Increment() 472 return false 473 } 474 475 e.isPortReserved = true 476 e.boundDest = dest 477 return true 478 } 479 480 // notifyAborted wakes up any waiters on registered, but not accepted 481 // endpoints. 482 // 483 // This is strictly not required normally as a socket that was never accepted 484 // can't really have any registered waiters except when stack.Wait() is called 485 // which waits for all registered endpoints to stop and expects an EventHUp. 486 func (e *endpoint) notifyAborted() { 487 e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 488 } 489 490 // handleSynSegment is called in its own goroutine once the listening endpoint 491 // receives a SYN segment. It is responsible for completing the handshake and 492 // queueing the new endpoint for acceptance. 493 // 494 // A limited number of these goroutines are allowed before TCP starts using SYN 495 // cookies to accept connections. 496 // 497 // Precondition: if ctx.listenEP != nil, ctx.listenEP.mu must be locked. 498 func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) tcpip.Error { 499 defer s.decRef() 500 501 h, err := ctx.startHandshake(s, opts, &waiter.Queue{}, e.owner) 502 if err != nil { 503 e.stack.Stats().TCP.FailedConnectionAttempts.Increment() 504 e.stats.FailedConnectionAttempts.Increment() 505 atomic.AddInt32(&e.synRcvdCount, -1) 506 return err 507 } 508 509 go func() { 510 // Note that startHandshake returns a locked endpoint. The 511 // force call here just makes it so. 512 if err := h.complete(); err != nil { // +checklocksforce 513 e.stack.Stats().TCP.FailedConnectionAttempts.Increment() 514 e.stats.FailedConnectionAttempts.Increment() 515 ctx.cleanupFailedHandshake(h) 516 atomic.AddInt32(&e.synRcvdCount, -1) 517 return 518 } 519 ctx.cleanupCompletedHandshake(h) 520 h.ep.startAcceptedLoop() 521 e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() 522 e.deliverAccepted(h.ep, false /*withSynCookie*/) 523 }() 524 525 return nil 526 } 527 528 func (e *endpoint) synRcvdBacklogFull() bool { 529 e.acceptMu.Lock() 530 acceptedCap := e.accepted.cap 531 e.acceptMu.Unlock() 532 // The capacity of the accepted queue would always be one greater than the 533 // listen backlog. But, the SYNRCVD connections count is always checked 534 // against the listen backlog value for Linux parity reason. 535 // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/inet_connection_sock.h#L280 536 // 537 // We maintain an equality check here as the synRcvdCount is incremented 538 // and compared only from a single listener context and the capacity of 539 // the accepted queue can only increase by a new listen call. 540 return int(atomic.LoadInt32(&e.synRcvdCount)) == acceptedCap-1 541 } 542 543 func (e *endpoint) acceptQueueIsFull() bool { 544 e.acceptMu.Lock() 545 full := e.accepted != (accepted{}) && e.accepted.endpoints.Len() == e.accepted.cap 546 e.acceptMu.Unlock() 547 return full 548 } 549 550 // handleListenSegment is called when a listening endpoint receives a segment 551 // and needs to handle it. 552 // 553 // Precondition: if ctx.listenEP != nil, ctx.listenEP.mu must be locked. 554 func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Error { 555 e.rcvQueueInfo.rcvQueueMu.Lock() 556 rcvClosed := e.rcvQueueInfo.RcvClosed 557 e.rcvQueueInfo.rcvQueueMu.Unlock() 558 if rcvClosed || s.flags.Contains(header.TCPFlagSyn|header.TCPFlagAck) { 559 // If the endpoint is shutdown, reply with reset. 560 // 561 // RFC 793 section 3.4 page 35 (figure 12) outlines that a RST 562 // must be sent in response to a SYN-ACK while in the listen 563 // state to prevent completing a handshake from an old SYN. 564 return replyWithReset(e.stack, s, e.sendTOS, e.ttl) 565 } 566 567 switch { 568 case s.flags.Contains(header.TCPFlagRst): 569 e.stack.Stats().DroppedPackets.Increment() 570 return nil 571 572 case s.flags == header.TCPFlagSyn: 573 if e.acceptQueueIsFull() { 574 e.stack.Stats().TCP.ListenOverflowSynDrop.Increment() 575 e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment() 576 e.stack.Stats().DroppedPackets.Increment() 577 return nil 578 } 579 580 opts := parseSynSegmentOptions(s) 581 if !ctx.useSynCookies() { 582 s.incRef() 583 atomic.AddInt32(&e.synRcvdCount, 1) 584 return e.handleSynSegment(ctx, s, &opts) 585 } 586 route, err := e.stack.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */) 587 if err != nil { 588 return err 589 } 590 defer route.Release() 591 592 // Send SYN without window scaling because we currently 593 // don't encode this information in the cookie. 594 // 595 // Enable Timestamp option if the original syn did have 596 // the timestamp option specified. 597 // 598 // Use the user supplied MSS on the listening socket for 599 // new connections, if available. 600 synOpts := header.TCPSynOptions{ 601 WS: -1, 602 TS: opts.TS, 603 TSVal: tcpTimeStamp(e.stack.Clock().NowMonotonic(), timeStampOffset(e.stack.Rand())), 604 TSEcr: opts.TSVal, 605 MSS: calculateAdvertisedMSS(e.userMSS, route), 606 } 607 cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS)) 608 fields := tcpFields{ 609 id: s.id, 610 ttl: e.ttl, 611 tos: e.sendTOS, 612 flags: header.TCPFlagSyn | header.TCPFlagAck, 613 seq: cookie, 614 ack: s.sequenceNumber + 1, 615 rcvWnd: ctx.rcvWnd, 616 } 617 if err := e.sendSynTCP(route, fields, synOpts); err != nil { 618 return err 619 } 620 e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment() 621 return nil 622 623 case s.flags.Contains(header.TCPFlagAck): 624 if e.acceptQueueIsFull() { 625 // Silently drop the ack as the application can't accept 626 // the connection at this point. The ack will be 627 // retransmitted by the sender anyway and we can 628 // complete the connection at the time of retransmit if 629 // the backlog has space. 630 e.stack.Stats().TCP.ListenOverflowAckDrop.Increment() 631 e.stats.ReceiveErrors.ListenOverflowAckDrop.Increment() 632 e.stack.Stats().DroppedPackets.Increment() 633 return nil 634 } 635 636 iss := s.ackNumber - 1 637 irs := s.sequenceNumber - 1 638 639 // Since SYN cookies are in use this is potentially an ACK to a 640 // SYN-ACK we sent but don't have a half open connection state 641 // as cookies are being used to protect against a potential SYN 642 // flood. In such cases validate the cookie and if valid create 643 // a fully connected endpoint and deliver to the accept queue. 644 // 645 // If not, silently drop the ACK to avoid leaking information 646 // when under a potential syn flood attack. 647 // 648 // Validate the cookie. 649 data, ok := ctx.isCookieValid(s.id, iss, irs) 650 if !ok || int(data) >= len(mssTable) { 651 e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment() 652 e.stack.Stats().DroppedPackets.Increment() 653 654 // When not using SYN cookies, as per RFC 793, section 3.9, page 64: 655 // Any acknowledgment is bad if it arrives on a connection still in 656 // the LISTEN state. An acceptable reset segment should be formed 657 // for any arriving ACK-bearing segment. The RST should be 658 // formatted as follows: 659 // 660 // <SEQ=SEG.ACK><CTL=RST> 661 // 662 // Send a reset as this is an ACK for which there is no 663 // half open connections and we are not using cookies 664 // yet. 665 // 666 // The only time we should reach here when a connection 667 // was opened and closed really quickly and a delayed 668 // ACK was received from the sender. 669 return replyWithReset(e.stack, s, e.sendTOS, e.ttl) 670 } 671 e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment() 672 // Create newly accepted endpoint and deliver it. 673 rcvdSynOptions := &header.TCPSynOptions{ 674 MSS: mssTable[data], 675 // Disable Window scaling as original SYN is 676 // lost. 677 WS: -1, 678 } 679 680 // When syn cookies are in use we enable timestamp only 681 // if the ack specifies the timestamp option assuming 682 // that the other end did in fact negotiate the 683 // timestamp option in the original SYN. 684 if s.parsedOptions.TS { 685 rcvdSynOptions.TS = true 686 rcvdSynOptions.TSVal = s.parsedOptions.TSVal 687 rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr 688 } 689 690 n, err := ctx.createConnectingEndpoint(s, rcvdSynOptions, &waiter.Queue{}) 691 if err != nil { 692 return err 693 } 694 695 n.mu.Lock() 696 697 // Propagate any inheritable options from the listening endpoint 698 // to the newly created endpoint. 699 e.propagateInheritableOptionsLocked(n) 700 701 if !n.reserveTupleLocked() { 702 n.mu.Unlock() 703 n.Close() 704 705 e.stack.Stats().TCP.FailedConnectionAttempts.Increment() 706 e.stats.FailedConnectionAttempts.Increment() 707 return nil 708 } 709 710 // Register new endpoint so that packets are routed to it. 711 if err := n.stack.RegisterTransportEndpoint( 712 n.effectiveNetProtos, 713 ProtocolNumber, 714 n.TransportEndpointInfo.ID, 715 n, 716 n.boundPortFlags, 717 n.boundBindToDevice, 718 ); err != nil { 719 n.mu.Unlock() 720 n.Close() 721 722 e.stack.Stats().TCP.FailedConnectionAttempts.Increment() 723 e.stats.FailedConnectionAttempts.Increment() 724 return err 725 } 726 727 n.isRegistered = true 728 729 // clear the tsOffset for the newly created 730 // endpoint as the Timestamp was already 731 // randomly offset when the original SYN-ACK was 732 // sent above. 733 n.TSOffset = 0 734 735 // Switch state to connected. 736 n.isConnectNotified = true 737 n.transitionToStateEstablishedLocked(&handshake{ 738 ep: n, 739 iss: iss, 740 ackNum: irs + 1, 741 rcvWnd: seqnum.Size(n.initialReceiveWindow()), 742 sndWnd: s.window, 743 rcvWndScale: e.rcvWndScaleForHandshake(), 744 sndWndScale: rcvdSynOptions.WS, 745 mss: rcvdSynOptions.MSS, 746 }) 747 748 // Requeue the segment if the ACK completing the handshake has more info 749 // to be procesed by the newly established endpoint. 750 if (s.flags.Contains(header.TCPFlagFin) || s.data.Size() > 0) && n.enqueueSegment(s) { 751 s.incRef() 752 n.newSegmentWaker.Assert() 753 } 754 755 // Do the delivery in a separate goroutine so 756 // that we don't block the listen loop in case 757 // the application is slow to accept or stops 758 // accepting. 759 // 760 // NOTE: This won't result in an unbounded 761 // number of goroutines as we do check before 762 // entering here that there was at least some 763 // space available in the backlog. 764 765 // Start the protocol goroutine. 766 n.startAcceptedLoop() 767 e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() 768 go e.deliverAccepted(n, true /*withSynCookie*/) 769 return nil 770 771 default: 772 e.stack.Stats().DroppedPackets.Increment() 773 return nil 774 } 775 } 776 777 // protocolListenLoop is the main loop of a listening TCP endpoint. It runs in 778 // its own goroutine and is responsible for handling connection requests. 779 func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) { 780 e.mu.Lock() 781 v6Only := e.ops.GetV6Only() 782 ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto) 783 784 defer func() { 785 // Mark endpoint as closed. This will prevent goroutines running 786 // handleSynSegment() from attempting to queue new connections 787 // to the endpoint. 788 e.setEndpointState(StateClose) 789 790 // Close any endpoints in SYN-RCVD state. 791 ctx.closeAllPendingEndpoints() 792 793 // Do cleanup if needed. 794 e.completeWorkerLocked() 795 796 if e.drainDone != nil { 797 close(e.drainDone) 798 } 799 e.mu.Unlock() 800 801 e.drainClosingSegmentQueue() 802 803 // Notify waiters that the endpoint is shutdown. 804 e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr) 805 }() 806 807 var s sleep.Sleeper 808 s.AddWaker(&e.notificationWaker, wakerForNotification) 809 s.AddWaker(&e.newSegmentWaker, wakerForNewSegment) 810 for { 811 e.mu.Unlock() 812 index, _ := s.Fetch(true) 813 e.mu.Lock() 814 switch index { 815 case wakerForNotification: 816 n := e.fetchNotifications() 817 if n¬ifyClose != 0 { 818 return 819 } 820 if n¬ifyDrain != 0 { 821 for !e.segmentQueue.empty() { 822 s := e.segmentQueue.dequeue() 823 // TODO(github.com/SagerNet/issue/4690): Better handle errors instead of 824 // silently dropping. 825 _ = e.handleListenSegment(ctx, s) 826 s.decRef() 827 } 828 close(e.drainDone) 829 e.mu.Unlock() 830 <-e.undrain 831 e.mu.Lock() 832 } 833 834 case wakerForNewSegment: 835 // Process at most maxSegmentsPerWake segments. 836 mayRequeue := true 837 for i := 0; i < maxSegmentsPerWake; i++ { 838 s := e.segmentQueue.dequeue() 839 if s == nil { 840 mayRequeue = false 841 break 842 } 843 844 // TODO(github.com/SagerNet/issue/4690): Better handle errors instead of 845 // silently dropping. 846 _ = e.handleListenSegment(ctx, s) 847 s.decRef() 848 } 849 850 // If the queue is not empty, make sure we'll wake up 851 // in the next iteration. 852 if mayRequeue && !e.segmentQueue.empty() { 853 e.newSegmentWaker.Assert() 854 } 855 } 856 } 857 }