github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/tcp/accept.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tcp
    16  
    17  import (
    18  	"crypto/sha1"
    19  	"encoding/binary"
    20  	"fmt"
    21  	"hash"
    22  	"io"
    23  	"sync/atomic"
    24  	"time"
    25  
    26  	"github.com/SagerNet/gvisor/pkg/sleep"
    27  	"github.com/SagerNet/gvisor/pkg/sync"
    28  	"github.com/SagerNet/gvisor/pkg/tcpip"
    29  	"github.com/SagerNet/gvisor/pkg/tcpip/header"
    30  	"github.com/SagerNet/gvisor/pkg/tcpip/ports"
    31  	"github.com/SagerNet/gvisor/pkg/tcpip/seqnum"
    32  	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
    33  	"github.com/SagerNet/gvisor/pkg/waiter"
    34  )
    35  
const (
	// tsLen is the length, in bits, of the timestamp in the SYN cookie.
	tsLen = 8

	// tsMask is a mask for timestamp values (i.e., the low tsLen bits).
	tsMask = (1 << tsLen) - 1

	// tsOffset is the offset, in bits, of the timestamp in the SYN cookie.
	// With tsLen == 8 the timestamp occupies the top byte of the 32-bit
	// cookie value.
	tsOffset = 24

	// hashMask is the mask for hash values (i.e., the low tsOffset bits).
	hashMask = (1 << tsOffset) - 1

	// maxTSDiff is the maximum allowed difference between a received cookie
	// timestamp and the current timestamp. If the difference is greater
	// than maxTSDiff, the cookie is expired. The difference is measured in
	// timestamp ticks as returned by timeStamp.
	maxTSDiff = 2
)
    54  
    55  var (
    56  	// mssTable is a slice containing the possible MSS values that we
    57  	// encode in the SYN cookie with two bits.
    58  	mssTable = []uint16{536, 1300, 1440, 1460}
    59  )
    60  
    61  func encodeMSS(mss uint16) uint32 {
    62  	for i := len(mssTable) - 1; i > 0; i-- {
    63  		if mss >= mssTable[i] {
    64  			return uint32(i)
    65  		}
    66  	}
    67  	return 0
    68  }
    69  
// listenContext is used by a listening endpoint to store state used while
// listening for connections. This struct is allocated by the listen goroutine
// and must not be accessed or have its methods called concurrently as they
// may mutate the stored objects.
type listenContext struct {
	// stack is the network stack the context operates on; it supplies the
	// clock, routes and protocol options used by the methods below.
	stack *stack.Stack

	// rcvWnd is the receive window that is sent by this listening context
	// in the initial SYN-ACK.
	rcvWnd seqnum.Size

	// nonce are random bytes that are initialized once when the context
	// is created and used to seed the hash function when generating
	// the SYN cookie. cookieHash selects one of the two entries through its
	// nonceIndex argument.
	nonce [2][sha1.BlockSize]byte

	// listenEP is a reference to the listening endpoint associated with
	// this context. Can be nil if the context is created by the forwarder.
	listenEP *endpoint

	// hasherMu protects hasher.
	hasherMu sync.Mutex
	// hasher is the hash function used to generate a SYN cookie. It is
	// reset and reused for every cookieHash call.
	hasher hash.Hash

	// v6Only is true if listenEP is a dual stack socket and has the
	// IPV6_V6ONLY option set.
	v6Only bool

	// netProto indicates the network protocol(IPv4/v6) for the listening
	// endpoint. When it is zero, createConnectingEndpoint falls back to the
	// network protocol of the incoming segment.
	netProto tcpip.NetworkProtocolNumber

	// pendingMu protects pendingEndpoints. This should only be accessed
	// by the listening endpoint's worker goroutine.
	//
	// Lock Ordering: listenEP.workerMu -> pendingMu
	pendingMu sync.Mutex
	// pending is used to wait for all pendingEndpoints to finish when
	// a socket is closed.
	pending sync.WaitGroup
	// pendingEndpoints is a map of all endpoints for which a handshake is
	// in progress, keyed by the endpoint's transport ID.
	pendingEndpoints map[stack.TransportEndpointID]*endpoint
}
   115  
   116  // timeStamp returns an 8-bit timestamp with a granularity of 64 seconds.
   117  func timeStamp(clock tcpip.Clock) uint32 {
   118  	return uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Seconds()) >> 6 & tsMask
   119  }
   120  
   121  // newListenContext creates a new listen context.
   122  func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
   123  	l := &listenContext{
   124  		stack:            stk,
   125  		rcvWnd:           rcvWnd,
   126  		hasher:           sha1.New(),
   127  		v6Only:           v6Only,
   128  		netProto:         netProto,
   129  		listenEP:         listenEP,
   130  		pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint),
   131  	}
   132  
   133  	for i := range l.nonce {
   134  		if _, err := io.ReadFull(stk.SecureRNG(), l.nonce[i][:]); err != nil {
   135  			panic(err)
   136  		}
   137  	}
   138  
   139  	return l
   140  }
   141  
   142  // cookieHash calculates the cookieHash for the given id, timestamp and nonce
   143  // index. The hash is used to create and validate cookies.
   144  func (l *listenContext) cookieHash(id stack.TransportEndpointID, ts uint32, nonceIndex int) uint32 {
   145  
   146  	// Initialize block with fixed-size data: local ports and v.
   147  	var payload [8]byte
   148  	binary.BigEndian.PutUint16(payload[0:], id.LocalPort)
   149  	binary.BigEndian.PutUint16(payload[2:], id.RemotePort)
   150  	binary.BigEndian.PutUint32(payload[4:], ts)
   151  
   152  	// Feed everything to the hasher.
   153  	l.hasherMu.Lock()
   154  	l.hasher.Reset()
   155  
   156  	// Per hash.Hash.Writer:
   157  	//
   158  	// It never returns an error.
   159  	l.hasher.Write(payload[:])
   160  	l.hasher.Write(l.nonce[nonceIndex][:])
   161  	l.hasher.Write([]byte(id.LocalAddress))
   162  	l.hasher.Write([]byte(id.RemoteAddress))
   163  
   164  	// Finalize the calculation of the hash and return the first 4 bytes.
   165  	h := l.hasher.Sum(nil)
   166  	l.hasherMu.Unlock()
   167  
   168  	return binary.BigEndian.Uint32(h[:])
   169  }
   170  
   171  // createCookie creates a SYN cookie for the given id and incoming sequence
   172  // number.
   173  func (l *listenContext) createCookie(id stack.TransportEndpointID, seq seqnum.Value, data uint32) seqnum.Value {
   174  	ts := timeStamp(l.stack.Clock())
   175  	v := l.cookieHash(id, 0, 0) + uint32(seq) + (ts << tsOffset)
   176  	v += (l.cookieHash(id, ts, 1) + data) & hashMask
   177  	return seqnum.Value(v)
   178  }
   179  
   180  // isCookieValid checks if the supplied cookie is valid for the given id and
   181  // sequence number. If it is, it also returns the data originally encoded in the
   182  // cookie when createCookie was called.
   183  func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnum.Value, seq seqnum.Value) (uint32, bool) {
   184  	ts := timeStamp(l.stack.Clock())
   185  	v := uint32(cookie) - l.cookieHash(id, 0, 0) - uint32(seq)
   186  	cookieTS := v >> tsOffset
   187  	if ((ts - cookieTS) & tsMask) > maxTSDiff {
   188  		return 0, false
   189  	}
   190  
   191  	return (v - l.cookieHash(id, cookieTS, 1)) & hashMask, true
   192  }
   193  
   194  func (l *listenContext) useSynCookies() bool {
   195  	var alwaysUseSynCookies tcpip.TCPAlwaysUseSynCookies
   196  	if err := l.stack.TransportProtocolOption(header.TCPProtocolNumber, &alwaysUseSynCookies); err != nil {
   197  		panic(fmt.Sprintf("TransportProtocolOption(%d, %T) = %s", header.TCPProtocolNumber, alwaysUseSynCookies, err))
   198  	}
   199  	return bool(alwaysUseSynCookies) || (l.listenEP != nil && l.listenEP.synRcvdBacklogFull())
   200  }
   201  
// createConnectingEndpoint creates a new endpoint in a connecting state, with
// the connection parameters given by the arguments.
//
// The endpoint takes its ID, NIC and effective network protocol from s, its
// receive buffer size from l.rcvWnd, and its timestamp/SACK settings from
// rcvdSynOpts. The new endpoint is neither locked nor registered with the
// stack; that is left to the caller. If no route to the peer can be found, the
// error is returned and no endpoint is created.
func (l *listenContext) createConnectingEndpoint(s *segment, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, tcpip.Error) {
	// Use the context's network protocol for the new endpoint, falling back
	// to the protocol of the incoming segment when the context has none set.
	netProto := l.netProto
	if netProto == 0 {
		netProto = s.netProto
	}

	// The route is always looked up with the segment's own network protocol.
	route, err := l.stack.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */)
	if err != nil {
		return nil, err
	}

	n := newEndpoint(l.stack, netProto, queue)
	n.ops.SetV6Only(l.v6Only)
	n.TransportEndpointInfo.ID = s.id
	n.boundNICID = s.nicID
	n.route = route
	n.effectiveNetProtos = []tcpip.NetworkProtocolNumber{s.netProto}
	n.ops.SetReceiveBufferSize(int64(l.rcvWnd), false /* notify */)
	// NOTE(review): n.userMSS is still whatever newEndpoint initialised it
	// to here; callers propagate the listener's userMSS only after this
	// function returns. Confirm this is intended.
	n.amss = calculateAdvertisedMSS(n.userMSS, n.route)
	n.setEndpointState(StateConnecting)

	n.maybeEnableTimestamp(rcvdSynOpts)
	n.maybeEnableSACKPermitted(rcvdSynOpts)

	n.initGSO()

	// Bootstrap the auto tuning algorithm. Starting at zero will result in
	// a large step function on the first window adjustment causing the
	// window to grow to a really large value.
	n.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = n.initialReceiveWindow()

	return n, nil
}
   238  
// startHandshake creates a new endpoint in connecting state and then sends
// the SYN-ACK for the TCP 3-way handshake. It returns the state of the
// handshake in progress, which includes the new endpoint in the SYN-RCVD
// state.
//
// On success, a handshake h is returned with h.ep.mu held.
//
// On failure the new endpoint (if one was created) is unlocked and closed,
// and it is removed from the pending set if it had been added to it.
//
// Precondition: if l.listenEP != nil, l.listenEP.mu must be locked.
func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*handshake, tcpip.Error) {
	// Create new endpoint.
	irs := s.sequenceNumber
	isn := generateSecureISN(s.id, l.stack.Clock(), l.stack.Seed())
	ep, err := l.createConnectingEndpoint(s, opts, queue)
	if err != nil {
		return nil, err
	}

	// Lock the endpoint before registering to ensure that no out of
	// band changes are possible due to incoming packets etc till
	// the endpoint is done initializing.
	ep.mu.Lock()
	ep.owner = owner

	// listenEP is nil when listenContext is used by tcp.Forwarder.
	deferAccept := time.Duration(0)
	if l.listenEP != nil {
		// The listener may have left the listen state since the SYN was
		// queued; in that case the connection attempt is aborted.
		if l.listenEP.EndpointState() != StateListen {

			// Ensure we release any registrations done by the newly
			// created endpoint.
			ep.mu.Unlock()
			ep.Close()

			return nil, &tcpip.ErrConnectionAborted{}
		}
		// Track the endpoint so that closeAllPendingEndpoints can notify
		// it and wait for it.
		l.addPendingEndpoint(ep)

		// Propagate any inheritable options from the listening endpoint
		// to the newly created endpoint.
		l.listenEP.propagateInheritableOptionsLocked(ep)

		if !ep.reserveTupleLocked() {
			ep.mu.Unlock()
			ep.Close()

			l.removePendingEndpoint(ep)

			return nil, &tcpip.ErrConnectionAborted{}
		}

		deferAccept = l.listenEP.deferAccept
	}

	// Register new endpoint so that packets are routed to it.
	if err := ep.stack.RegisterTransportEndpoint(
		ep.effectiveNetProtos,
		ProtocolNumber,
		ep.TransportEndpointInfo.ID,
		ep,
		ep.boundPortFlags,
		ep.boundBindToDevice,
	); err != nil {
		ep.mu.Unlock()
		ep.Close()

		if l.listenEP != nil {
			l.removePendingEndpoint(ep)
		}

		ep.drainClosingSegmentQueue()

		return nil, err
	}

	ep.isRegistered = true

	// Initialize and start the handshake.
	h := ep.newPassiveHandshake(isn, irs, opts, deferAccept)
	h.listenEP = l.listenEP
	h.start()
	return h, nil
}
   321  
   322  // performHandshake performs a TCP 3-way handshake. On success, the new
   323  // established endpoint is returned with e.mu held.
   324  //
   325  // Precondition: if l.listenEP != nil, l.listenEP.mu must be locked.
   326  func (l *listenContext) performHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*endpoint, tcpip.Error) {
   327  	h, err := l.startHandshake(s, opts, queue, owner)
   328  	if err != nil {
   329  		return nil, err
   330  	}
   331  	ep := h.ep
   332  
   333  	// N.B. the endpoint is generated above by startHandshake, and will be
   334  	// returned locked. This first call is forced.
   335  	if err := h.complete(); err != nil { // +checklocksforce
   336  		ep.stack.Stats().TCP.FailedConnectionAttempts.Increment()
   337  		ep.stats.FailedConnectionAttempts.Increment()
   338  		l.cleanupFailedHandshake(h)
   339  		return nil, err
   340  	}
   341  	l.cleanupCompletedHandshake(h)
   342  	return ep, nil
   343  }
   344  
   345  func (l *listenContext) addPendingEndpoint(n *endpoint) {
   346  	l.pendingMu.Lock()
   347  	l.pendingEndpoints[n.TransportEndpointInfo.ID] = n
   348  	l.pending.Add(1)
   349  	l.pendingMu.Unlock()
   350  }
   351  
   352  func (l *listenContext) removePendingEndpoint(n *endpoint) {
   353  	l.pendingMu.Lock()
   354  	delete(l.pendingEndpoints, n.TransportEndpointInfo.ID)
   355  	l.pending.Done()
   356  	l.pendingMu.Unlock()
   357  }
   358  
   359  func (l *listenContext) closeAllPendingEndpoints() {
   360  	l.pendingMu.Lock()
   361  	for _, n := range l.pendingEndpoints {
   362  		n.notifyProtocolGoroutine(notifyClose)
   363  	}
   364  	l.pendingMu.Unlock()
   365  	l.pending.Wait()
   366  }
   367  
// cleanupFailedHandshake tears down the endpoint of a handshake that did not
// complete: it unlocks and closes the endpoint, wakes any waiters, removes the
// endpoint from the pending set (when there is a listening endpoint), drains
// its closing segment queue and drops the handshake state.
//
// Precondition: h.ep.mu must be held.
// +checklocks:h.ep.mu
func (l *listenContext) cleanupFailedHandshake(h *handshake) {
	e := h.ep
	// The endpoint is unlocked before Close is called on it.
	e.mu.Unlock()
	e.Close()
	e.notifyAborted()
	if l.listenEP != nil {
		l.removePendingEndpoint(e)
	}
	e.drainClosingSegmentQueue()
	// Clear the handshake state stored in the endpoint.
	e.h = nil
}
   381  
   382  // cleanupCompletedHandshake transfers any state from the completed handshake to
   383  // the new endpoint.
   384  //
   385  // Precondition: h.ep.mu must be held.
   386  func (l *listenContext) cleanupCompletedHandshake(h *handshake) {
   387  	e := h.ep
   388  	if l.listenEP != nil {
   389  		l.removePendingEndpoint(e)
   390  	}
   391  	e.isConnectNotified = true
   392  
   393  	// Update the receive window scaling. We can't do it before the
   394  	// handshake because it's possible that the peer doesn't support window
   395  	// scaling.
   396  	e.rcv.RcvWndScale = e.h.effectiveRcvWndScale()
   397  
   398  	// Clean up handshake state stored in the endpoint so that it can be GCed.
   399  	e.h = nil
   400  }
   401  
// deliverAccepted delivers the newly-accepted endpoint to the listener. If the
// listener has transitioned out of the listen state (accepted is the zero
// value), the new endpoint is reset instead.
//
// The call blocks while the accept queue is at capacity. withSynCookie
// indicates that n was created from a SYN cookie; only when it is false is
// synRcvdCount decremented on delivery.
func (e *endpoint) deliverAccepted(n *endpoint, withSynCookie bool) {
	// Register this delivery in pendingAccepted for the duration of the
	// call.
	e.mu.Lock()
	e.pendingAccepted.Add(1)
	e.mu.Unlock()
	defer e.pendingAccepted.Done()

	// Drop the lock before notifying to avoid deadlock in user-specified
	// callbacks. The closure scopes acceptMu so that it is released before
	// the waiter queue is notified below.
	delivered := func() bool {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()
		for {
			// A zero-valued accepted means the listener is no longer
			// accepting connections.
			if e.accepted == (accepted{}) {
				return false
			}
			// Queue full: wait on acceptCond, then re-check both
			// conditions.
			if e.accepted.endpoints.Len() == e.accepted.cap {
				e.acceptCond.Wait()
				continue
			}

			e.accepted.endpoints.PushBack(n)
			if !withSynCookie {
				atomic.AddInt32(&e.synRcvdCount, -1)
			}
			return true
		}
	}()
	if delivered {
		e.waiterQueue.Notify(waiter.ReadableEvents)
	} else {
		n.notifyProtocolGoroutine(notifyReset)
	}
}
   438  
   439  // propagateInheritableOptionsLocked propagates any options set on the listening
   440  // endpoint to the newly created endpoint.
   441  //
   442  // Precondition: e.mu and n.mu must be held.
   443  func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) {
   444  	n.userTimeout = e.userTimeout
   445  	n.portFlags = e.portFlags
   446  	n.boundBindToDevice = e.boundBindToDevice
   447  	n.boundPortFlags = e.boundPortFlags
   448  	n.userMSS = e.userMSS
   449  }
   450  
   451  // reserveTupleLocked reserves an accepted endpoint's tuple.
   452  //
   453  // Preconditions:
   454  // * propagateInheritableOptionsLocked has been called.
   455  // * e.mu is held.
   456  func (e *endpoint) reserveTupleLocked() bool {
   457  	dest := tcpip.FullAddress{
   458  		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
   459  		Port: e.TransportEndpointInfo.ID.RemotePort,
   460  	}
   461  	portRes := ports.Reservation{
   462  		Networks:     e.effectiveNetProtos,
   463  		Transport:    ProtocolNumber,
   464  		Addr:         e.TransportEndpointInfo.ID.LocalAddress,
   465  		Port:         e.TransportEndpointInfo.ID.LocalPort,
   466  		Flags:        e.boundPortFlags,
   467  		BindToDevice: e.boundBindToDevice,
   468  		Dest:         dest,
   469  	}
   470  	if !e.stack.ReserveTuple(portRes) {
   471  		e.stack.Stats().TCP.FailedPortReservations.Increment()
   472  		return false
   473  	}
   474  
   475  	e.isPortReserved = true
   476  	e.boundDest = dest
   477  	return true
   478  }
   479  
   480  // notifyAborted wakes up any waiters on registered, but not accepted
   481  // endpoints.
   482  //
   483  // This is strictly not required normally as a socket that was never accepted
   484  // can't really have any registered waiters except when stack.Wait() is called
   485  // which waits for all registered endpoints to stop and expects an EventHUp.
   486  func (e *endpoint) notifyAborted() {
   487  	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
   488  }
   489  
// handleSynSegment is called once the listening endpoint receives a SYN
// segment. It starts the handshake synchronously and then spawns a goroutine
// that is responsible for completing the handshake and queueing the new
// endpoint for acceptance.
//
// A limited number of these goroutines are allowed before TCP starts using SYN
// cookies to accept connections.
//
// The reference on s held for this call is released before returning. An
// error is returned only when the handshake could not be started; failures
// while completing it are recorded in the stats by the spawned goroutine.
//
// Precondition: if ctx.listenEP != nil, ctx.listenEP.mu must be locked.
func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) tcpip.Error {
	defer s.decRef()

	h, err := ctx.startHandshake(s, opts, &waiter.Queue{}, e.owner)
	if err != nil {
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.stats.FailedConnectionAttempts.Increment()
		// Undo the synRcvdCount increment made by the caller for this
		// SYN.
		atomic.AddInt32(&e.synRcvdCount, -1)
		return err
	}

	go func() {
		// Note that startHandshake returns a locked endpoint. The
		// force call here just makes it so.
		if err := h.complete(); err != nil { // +checklocksforce
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
			ctx.cleanupFailedHandshake(h)
			atomic.AddInt32(&e.synRcvdCount, -1)
			return
		}
		ctx.cleanupCompletedHandshake(h)
		h.ep.startAcceptedLoop()
		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
		// On successful delivery deliverAccepted decrements synRcvdCount
		// because withSynCookie is false.
		e.deliverAccepted(h.ep, false /*withSynCookie*/)
	}()

	return nil
}
   527  
   528  func (e *endpoint) synRcvdBacklogFull() bool {
   529  	e.acceptMu.Lock()
   530  	acceptedCap := e.accepted.cap
   531  	e.acceptMu.Unlock()
   532  	// The capacity of the accepted queue would always be one greater than the
   533  	// listen backlog. But, the SYNRCVD connections count is always checked
   534  	// against the listen backlog value for Linux parity reason.
   535  	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/inet_connection_sock.h#L280
   536  	//
   537  	// We maintain an equality check here as the synRcvdCount is incremented
   538  	// and compared only from a single listener context and the capacity of
   539  	// the accepted queue can only increase by a new listen call.
   540  	return int(atomic.LoadInt32(&e.synRcvdCount)) == acceptedCap-1
   541  }
   542  
   543  func (e *endpoint) acceptQueueIsFull() bool {
   544  	e.acceptMu.Lock()
   545  	full := e.accepted != (accepted{}) && e.accepted.endpoints.Len() == e.accepted.cap
   546  	e.acceptMu.Unlock()
   547  	return full
   548  }
   549  
// handleListenSegment is called when a listening endpoint receives a segment
// and needs to handle it.
//
// Depending on the segment's flags it replies with a reset, starts a regular
// handshake, answers with a SYN cookie, completes a cookie-based connection,
// or drops the segment. Dropped segments are counted in the stack's stats and
// yield a nil error.
//
// Precondition: if ctx.listenEP != nil, ctx.listenEP.mu must be locked.
func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Error {
	e.rcvQueueInfo.rcvQueueMu.Lock()
	rcvClosed := e.rcvQueueInfo.RcvClosed
	e.rcvQueueInfo.rcvQueueMu.Unlock()
	if rcvClosed || s.flags.Contains(header.TCPFlagSyn|header.TCPFlagAck) {
		// If the endpoint is shutdown, reply with reset.
		//
		// RFC 793 section 3.4 page 35 (figure 12) outlines that a RST
		// must be sent in response to a SYN-ACK while in the listen
		// state to prevent completing a handshake from an old SYN.
		return replyWithReset(e.stack, s, e.sendTOS, e.ttl)
	}

	switch {
	case s.flags.Contains(header.TCPFlagRst):
		// Resets received while listening are dropped.
		e.stack.Stats().DroppedPackets.Increment()
		return nil

	case s.flags == header.TCPFlagSyn:
		// A pure SYN: a new connection request.
		if e.acceptQueueIsFull() {
			e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
			e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment()
			e.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		opts := parseSynSegmentOptions(s)
		if !ctx.useSynCookies() {
			// Regular path: take a reference for handleSynSegment
			// (which releases it) and account for the new SYN-RCVD
			// connection.
			s.incRef()
			atomic.AddInt32(&e.synRcvdCount, 1)
			return e.handleSynSegment(ctx, s, &opts)
		}
		route, err := e.stack.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */)
		if err != nil {
			return err
		}
		defer route.Release()

		// Send SYN-ACK without window scaling because we currently
		// don't encode this information in the cookie.
		//
		// Enable Timestamp option if the original syn did have
		// the timestamp option specified.
		//
		// Use the user supplied MSS on the listening socket for
		// new connections, if available.
		synOpts := header.TCPSynOptions{
			WS:    -1,
			TS:    opts.TS,
			TSVal: tcpTimeStamp(e.stack.Clock().NowMonotonic(), timeStampOffset(e.stack.Rand())),
			TSEcr: opts.TSVal,
			MSS:   calculateAdvertisedMSS(e.userMSS, route),
		}
		// The cookie is sent as our initial sequence number and carries
		// the index of the peer's MSS in mssTable.
		cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS))
		fields := tcpFields{
			id:     s.id,
			ttl:    e.ttl,
			tos:    e.sendTOS,
			flags:  header.TCPFlagSyn | header.TCPFlagAck,
			seq:    cookie,
			ack:    s.sequenceNumber + 1,
			rcvWnd: ctx.rcvWnd,
		}
		if err := e.sendSynTCP(route, fields, synOpts); err != nil {
			return err
		}
		e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment()
		return nil

	case s.flags.Contains(header.TCPFlagAck):
		if e.acceptQueueIsFull() {
			// Silently drop the ack as the application can't accept
			// the connection at this point. The ack will be
			// retransmitted by the sender anyway and we can
			// complete the connection at the time of retransmit if
			// the backlog has space.
			e.stack.Stats().TCP.ListenOverflowAckDrop.Increment()
			e.stats.ReceiveErrors.ListenOverflowAckDrop.Increment()
			e.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		// Recover the sequence numbers of the original SYN / SYN-ACK
		// exchange: the ACK acknowledges our ISS (the cookie) + 1 and
		// carries the peer's IRS + 1.
		iss := s.ackNumber - 1
		irs := s.sequenceNumber - 1

		// Since SYN cookies are in use this is potentially an ACK to a
		// SYN-ACK we sent but don't have a half open connection state
		// as cookies are being used to protect against a potential SYN
		// flood. In such cases validate the cookie and if valid create
		// a fully connected endpoint and deliver to the accept queue.
		//
		// If the cookie is not valid, the ACK is counted as dropped and
		// a reset is sent in reply (see below).
		//
		// Validate the cookie.
		data, ok := ctx.isCookieValid(s.id, iss, irs)
		if !ok || int(data) >= len(mssTable) {
			e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment()
			e.stack.Stats().DroppedPackets.Increment()

			// When not using SYN cookies, as per RFC 793, section 3.9, page 64:
			// Any acknowledgment is bad if it arrives on a connection still in
			// the LISTEN state.  An acceptable reset segment should be formed
			// for any arriving ACK-bearing segment.  The RST should be
			// formatted as follows:
			//
			//  <SEQ=SEG.ACK><CTL=RST>
			//
			// Send a reset as this is an ACK for which there is no
			// half open connections and we are not using cookies
			// yet.
			//
			// The only time we should reach here when a connection
			// was opened and closed really quickly and a delayed
			// ACK was received from the sender.
			return replyWithReset(e.stack, s, e.sendTOS, e.ttl)
		}
		e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment()
		// Create newly accepted endpoint and deliver it.
		rcvdSynOptions := &header.TCPSynOptions{
			MSS: mssTable[data],
			// Disable Window scaling as original SYN is
			// lost.
			WS: -1,
		}

		// When syn cookies are in use we enable timestamp only
		// if the ack specifies the timestamp option assuming
		// that the other end did in fact negotiate the
		// timestamp option in the original SYN.
		if s.parsedOptions.TS {
			rcvdSynOptions.TS = true
			rcvdSynOptions.TSVal = s.parsedOptions.TSVal
			rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr
		}

		n, err := ctx.createConnectingEndpoint(s, rcvdSynOptions, &waiter.Queue{})
		if err != nil {
			return err
		}

		// Hold the new endpoint's lock while it is being set up.
		n.mu.Lock()

		// Propagate any inheritable options from the listening endpoint
		// to the newly created endpoint.
		e.propagateInheritableOptionsLocked(n)

		if !n.reserveTupleLocked() {
			n.mu.Unlock()
			n.Close()

			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
			return nil
		}

		// Register new endpoint so that packets are routed to it.
		if err := n.stack.RegisterTransportEndpoint(
			n.effectiveNetProtos,
			ProtocolNumber,
			n.TransportEndpointInfo.ID,
			n,
			n.boundPortFlags,
			n.boundBindToDevice,
		); err != nil {
			n.mu.Unlock()
			n.Close()

			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
			return err
		}

		n.isRegistered = true

		// Clear the TSOffset for the newly created
		// endpoint as the Timestamp was already
		// randomly offset when the original SYN-ACK was
		// sent above.
		n.TSOffset = 0

		// Switch state to connected. No handshake was tracked for this
		// connection, so a handshake value is built from the ACK and the
		// options recovered from the cookie.
		n.isConnectNotified = true
		n.transitionToStateEstablishedLocked(&handshake{
			ep:          n,
			iss:         iss,
			ackNum:      irs + 1,
			rcvWnd:      seqnum.Size(n.initialReceiveWindow()),
			sndWnd:      s.window,
			rcvWndScale: e.rcvWndScaleForHandshake(),
			sndWndScale: rcvdSynOptions.WS,
			mss:         rcvdSynOptions.MSS,
		})

		// Requeue the segment if the ACK completing the handshake has more info
		// to be processed by the newly established endpoint. A reference
		// is taken on the segment once it has been enqueued.
		if (s.flags.Contains(header.TCPFlagFin) || s.data.Size() > 0) && n.enqueueSegment(s) {
			s.incRef()
			n.newSegmentWaker.Assert()
		}

		// Start the protocol goroutine.
		n.startAcceptedLoop()
		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()

		// Do the delivery in a separate goroutine so
		// that we don't block the listen loop in case
		// the application is slow to accept or stops
		// accepting.
		//
		// NOTE: This won't result in an unbounded
		// number of goroutines as we do check before
		// entering here that there was at least some
		// space available in the backlog.
		go e.deliverAccepted(n, true /*withSynCookie*/)
		return nil

	default:
		// Any other flag combination is dropped.
		e.stack.Stats().DroppedPackets.Increment()
		return nil
	}
}
   776  
// protocolListenLoop is the main loop of a listening TCP endpoint. It runs in
// its own goroutine and is responsible for handling connection requests.
//
// rcvWnd is the receive window handed to the listen context created for this
// loop. The loop holds e.mu except while it sleeps waiting for work or waits
// to be undrained, and it exits when a notifyClose notification is received.
func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) {
	e.mu.Lock()
	v6Only := e.ops.GetV6Only()
	ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto)

	// Runs when the loop returns; e.mu is held at that point and is
	// released inside the deferred function.
	defer func() {
		// Mark endpoint as closed. This will prevent goroutines running
		// handleSynSegment() from attempting to queue new connections
		// to the endpoint.
		e.setEndpointState(StateClose)

		// Close any endpoints in SYN-RCVD state.
		ctx.closeAllPendingEndpoints()

		// Do cleanup if needed.
		e.completeWorkerLocked()

		if e.drainDone != nil {
			close(e.drainDone)
		}
		e.mu.Unlock()

		e.drainClosingSegmentQueue()

		// Notify waiters that the endpoint is shutdown.
		e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
	}()

	var s sleep.Sleeper
	s.AddWaker(&e.notificationWaker, wakerForNotification)
	s.AddWaker(&e.newSegmentWaker, wakerForNewSegment)
	for {
		// Release e.mu while blocked waiting for a waker to fire.
		e.mu.Unlock()
		index, _ := s.Fetch(true)
		e.mu.Lock()
		switch index {
		case wakerForNotification:
			n := e.fetchNotifications()
			if n&notifyClose != 0 {
				return
			}
			if n&notifyDrain != 0 {
				// Handle every queued segment, signal that draining
				// is done, then wait (without e.mu) until undrain is
				// signalled.
				for !e.segmentQueue.empty() {
					s := e.segmentQueue.dequeue()
					// TODO(github.com/SagerNet/issue/4690): Better handle errors instead of
					// silently dropping.
					_ = e.handleListenSegment(ctx, s)
					s.decRef()
				}
				close(e.drainDone)
				e.mu.Unlock()
				<-e.undrain
				e.mu.Lock()
			}

		case wakerForNewSegment:
			// Process at most maxSegmentsPerWake segments.
			mayRequeue := true
			for i := 0; i < maxSegmentsPerWake; i++ {
				s := e.segmentQueue.dequeue()
				if s == nil {
					// Queue exhausted; no need to re-arm the waker.
					mayRequeue = false
					break
				}

				// TODO(github.com/SagerNet/issue/4690): Better handle errors instead of
				// silently dropping.
				_ = e.handleListenSegment(ctx, s)
				s.decRef()
			}

			// If the queue is not empty, make sure we'll wake up
			// in the next iteration.
			if mayRequeue && !e.segmentQueue.empty() {
				e.newSegmentWaker.Assert()
			}
		}
	}
}