github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/tcpip/transport/tcp/dispatcher.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "encoding/binary" 19 "fmt" 20 "math/rand" 21 22 "github.com/sagernet/gvisor/pkg/sleep" 23 "github.com/sagernet/gvisor/pkg/sync" 24 "github.com/sagernet/gvisor/pkg/tcpip" 25 "github.com/sagernet/gvisor/pkg/tcpip/hash/jenkins" 26 "github.com/sagernet/gvisor/pkg/tcpip/header" 27 "github.com/sagernet/gvisor/pkg/tcpip/stack" 28 "github.com/sagernet/gvisor/pkg/waiter" 29 ) 30 31 // epQueue is a queue of endpoints. 32 type epQueue struct { 33 mu sync.Mutex 34 list endpointList 35 } 36 37 // enqueue adds e to the queue if the endpoint is not already on the queue. 38 func (q *epQueue) enqueue(e *Endpoint) { 39 q.mu.Lock() 40 defer q.mu.Unlock() 41 e.pendingProcessingMu.Lock() 42 defer e.pendingProcessingMu.Unlock() 43 44 if e.pendingProcessing { 45 return 46 } 47 q.list.PushBack(e) 48 e.pendingProcessing = true 49 } 50 51 // dequeue removes and returns the first element from the queue if available, 52 // returns nil otherwise. 53 func (q *epQueue) dequeue() *Endpoint { 54 q.mu.Lock() 55 if e := q.list.Front(); e != nil { 56 q.list.Remove(e) 57 e.pendingProcessingMu.Lock() 58 e.pendingProcessing = false 59 e.pendingProcessingMu.Unlock() 60 q.mu.Unlock() 61 return e 62 } 63 q.mu.Unlock() 64 return nil 65 } 66 67 // empty returns true if the queue is empty, false otherwise. 68 func (q *epQueue) empty() bool { 69 q.mu.Lock() 70 v := q.list.Empty() 71 q.mu.Unlock() 72 return v 73 } 74 75 // processor is responsible for processing packets queued to a tcp endpoint. 76 type processor struct { 77 epQ epQueue 78 sleeper sleep.Sleeper 79 newEndpointWaker sleep.Waker 80 closeWaker sleep.Waker 81 pauseWaker sleep.Waker 82 pauseChan chan struct{} 83 resumeChan chan struct{} 84 } 85 86 func (p *processor) close() { 87 p.closeWaker.Assert() 88 } 89 90 func (p *processor) queueEndpoint(ep *Endpoint) { 91 // Queue an endpoint for processing by the processor goroutine. 92 p.epQ.enqueue(ep) 93 p.newEndpointWaker.Assert() 94 } 95 96 // deliverAccepted delivers a passively connected endpoint to the accept queue 97 // of its associated listening endpoint. 98 // 99 // +checklocks:ep.mu 100 func deliverAccepted(ep *Endpoint) bool { 101 lEP := ep.h.listenEP 102 lEP.acceptMu.Lock() 103 104 // Remove endpoint from list of pendingEndpoints as the handshake is now 105 // complete. 106 delete(lEP.acceptQueue.pendingEndpoints, ep) 107 // Deliver this endpoint to the listening socket's accept queue. 108 if lEP.acceptQueue.capacity == 0 { 109 lEP.acceptMu.Unlock() 110 return false 111 } 112 113 // NOTE: We always queue the endpoint and on purpose do not check if 114 // accept queue is full at this point. This is similar to linux because 115 // two racing incoming ACK's can both pass the acceptQueue.isFull check 116 // and proceed to ESTABLISHED state. In such a case its better to 117 // deliver both even if it temporarily exceeds the queue limit rather 118 // than drop a connection that is fully connected. 119 // 120 // For reference see: 121 // https://github.com/torvalds/linux/blob/169e77764adc041b1dacba84ea90516a895d43b2/net/ipv4/tcp_minisocks.c#L764 122 // https://github.com/torvalds/linux/blob/169e77764adc041b1dacba84ea90516a895d43b2/net/ipv4/tcp_ipv4.c#L1500 123 lEP.acceptQueue.endpoints.PushBack(ep) 124 lEP.acceptMu.Unlock() 125 ep.h.listenEP.waiterQueue.Notify(waiter.ReadableEvents) 126 127 return true 128 } 129 130 // handleConnecting is responsible for TCP processing for an endpoint in one of 131 // the connecting states. 132 func handleConnecting(ep *Endpoint) { 133 if !ep.TryLock() { 134 return 135 } 136 cleanup := func() { 137 ep.mu.Unlock() 138 ep.drainClosingSegmentQueue() 139 ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 140 } 141 if !ep.EndpointState().connecting() { 142 // If the endpoint has already transitioned out of a connecting 143 // stage then just return (only possible if it was closed or 144 // timed out by the time we got around to processing the wakeup. 145 ep.mu.Unlock() 146 return 147 } 148 if err := ep.h.processSegments(); err != nil { // +checklocksforce:ep.h.ep.mu 149 // handshake failed. clean up the tcp endpoint and handshake 150 // state. 151 if lEP := ep.h.listenEP; lEP != nil { 152 lEP.acceptMu.Lock() 153 delete(lEP.acceptQueue.pendingEndpoints, ep) 154 lEP.acceptMu.Unlock() 155 } 156 ep.handshakeFailed(err) 157 cleanup() 158 return 159 } 160 161 if ep.EndpointState() == StateEstablished && ep.h.listenEP != nil { 162 ep.isConnectNotified = true 163 ep.stack.Stats().TCP.PassiveConnectionOpenings.Increment() 164 if !deliverAccepted(ep) { 165 ep.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) 166 cleanup() 167 return 168 } 169 } 170 ep.mu.Unlock() 171 } 172 173 // handleConnected is responsible for TCP processing for an endpoint in one of 174 // the connected states(StateEstablished, StateFinWait1 etc.) 175 func handleConnected(ep *Endpoint) { 176 if !ep.TryLock() { 177 return 178 } 179 180 if !ep.EndpointState().connected() { 181 // If the endpoint has already transitioned out of a connected 182 // state then just return (only possible if it was closed or 183 // timed out by the time we got around to processing the wakeup. 184 ep.mu.Unlock() 185 return 186 } 187 188 // NOTE: We read this outside of e.mu lock which means that by the time 189 // we get to handleSegments the endpoint may not be in ESTABLISHED. But 190 // this should be fine as all normal shutdown states are handled by 191 // handleSegmentsLocked. 192 switch err := ep.handleSegmentsLocked(); { 193 case err != nil: 194 // Send any active resets if required. 195 ep.resetConnectionLocked(err) 196 fallthrough 197 case ep.EndpointState() == StateClose: 198 ep.mu.Unlock() 199 ep.drainClosingSegmentQueue() 200 ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 201 return 202 case ep.EndpointState() == StateTimeWait: 203 startTimeWait(ep) 204 } 205 ep.mu.Unlock() 206 } 207 208 // startTimeWait starts a new goroutine to handle TIME-WAIT. 209 // 210 // +checklocks:ep.mu 211 func startTimeWait(ep *Endpoint) { 212 // Disable close timer as we are now entering real TIME_WAIT. 213 if ep.finWait2Timer != nil { 214 ep.finWait2Timer.Stop() 215 } 216 // Wake up any waiters before we start TIME-WAIT. 217 ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 218 timeWaitDuration := ep.getTimeWaitDuration() 219 ep.timeWaitTimer = ep.stack.Clock().AfterFunc(timeWaitDuration, ep.timeWaitTimerExpired) 220 } 221 222 // handleTimeWait is responsible for TCP processing for an endpoint in TIME-WAIT 223 // state. 224 func handleTimeWait(ep *Endpoint) { 225 if !ep.TryLock() { 226 return 227 } 228 229 if ep.EndpointState() != StateTimeWait { 230 // If the endpoint has already transitioned out of a TIME-WAIT 231 // state then just return (only possible if it was closed or 232 // timed out by the time we got around to processing the wakeup. 233 ep.mu.Unlock() 234 return 235 } 236 237 extendTimeWait, reuseTW := ep.handleTimeWaitSegments() 238 if reuseTW != nil { 239 ep.transitionToStateCloseLocked() 240 ep.mu.Unlock() 241 ep.drainClosingSegmentQueue() 242 ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) 243 reuseTW() 244 return 245 } 246 if extendTimeWait { 247 ep.timeWaitTimer.Reset(ep.getTimeWaitDuration()) 248 } 249 ep.mu.Unlock() 250 } 251 252 // handleListen is responsible for TCP processing for an endpoint in LISTEN 253 // state. 254 func handleListen(ep *Endpoint) { 255 if !ep.TryLock() { 256 return 257 } 258 defer ep.mu.Unlock() 259 260 if ep.EndpointState() != StateListen { 261 // If the endpoint has already transitioned out of a LISTEN 262 // state then just return (only possible if it was closed or 263 // shutdown). 264 return 265 } 266 267 for i := 0; i < maxSegmentsPerWake; i++ { 268 s := ep.segmentQueue.dequeue() 269 if s == nil { 270 break 271 } 272 273 // TODO(gvisor.dev/issue/4690): Better handle errors instead of 274 // silently dropping. 275 _ = ep.handleListenSegment(ep.listenCtx, s) 276 s.DecRef() 277 } 278 } 279 280 // start runs the main loop for a processor which is responsible for all TCP 281 // processing for TCP endpoints. 282 func (p *processor) start(wg *sync.WaitGroup) { 283 defer wg.Done() 284 defer p.sleeper.Done() 285 286 for { 287 switch w := p.sleeper.Fetch(true); { 288 case w == &p.closeWaker: 289 return 290 case w == &p.pauseWaker: 291 if !p.epQ.empty() { 292 p.newEndpointWaker.Assert() 293 p.pauseWaker.Assert() 294 continue 295 } else { 296 p.pauseChan <- struct{}{} 297 <-p.resumeChan 298 } 299 case w == &p.newEndpointWaker: 300 for { 301 ep := p.epQ.dequeue() 302 if ep == nil { 303 break 304 } 305 if ep.segmentQueue.empty() { 306 continue 307 } 308 switch state := ep.EndpointState(); { 309 case state.connecting(): 310 handleConnecting(ep) 311 case state.connected() && state != StateTimeWait: 312 handleConnected(ep) 313 case state == StateTimeWait: 314 handleTimeWait(ep) 315 case state == StateListen: 316 handleListen(ep) 317 case state == StateError || state == StateClose: 318 // Try to redeliver any still queued 319 // packets to another endpoint or send a 320 // RST if it can't be delivered. 321 ep.mu.Lock() 322 if st := ep.EndpointState(); st == StateError || st == StateClose { 323 ep.drainClosingSegmentQueue() 324 } 325 ep.mu.Unlock() 326 default: 327 panic(fmt.Sprintf("unexpected tcp state in processor: %v", state)) 328 } 329 // If there are more segments to process and the 330 // endpoint lock is not held by user then 331 // requeue this endpoint for processing. 332 if !ep.segmentQueue.empty() && !ep.isOwnedByUser() { 333 p.epQ.enqueue(ep) 334 } 335 } 336 } 337 } 338 } 339 340 // pause pauses the processor loop. 341 func (p *processor) pause() chan struct{} { 342 p.pauseWaker.Assert() 343 return p.pauseChan 344 } 345 346 // resume resumes a previously paused loop. 347 // 348 // Precondition: Pause must have been called previously. 349 func (p *processor) resume() { 350 p.resumeChan <- struct{}{} 351 } 352 353 // dispatcher manages a pool of TCP endpoint processors which are responsible 354 // for the processing of inbound segments. This fixed pool of processor 355 // goroutines do full tcp processing. The processor is selected based on the 356 // hash of the endpoint id to ensure that delivery for the same endpoint happens 357 // in-order. 358 type dispatcher struct { 359 processors []processor 360 wg sync.WaitGroup 361 hasher jenkinsHasher 362 mu sync.Mutex 363 // +checklocks:mu 364 paused bool 365 // +checklocks:mu 366 closed bool 367 } 368 369 // init initializes a dispatcher and starts the main loop for all the processors 370 // owned by this dispatcher. 371 func (d *dispatcher) init(rng *rand.Rand, nProcessors int) { 372 d.close() 373 d.wait() 374 375 d.mu.Lock() 376 defer d.mu.Unlock() 377 d.closed = false 378 d.processors = make([]processor, nProcessors) 379 d.hasher = jenkinsHasher{seed: rng.Uint32()} 380 for i := range d.processors { 381 p := &d.processors[i] 382 p.sleeper.AddWaker(&p.newEndpointWaker) 383 p.sleeper.AddWaker(&p.closeWaker) 384 p.sleeper.AddWaker(&p.pauseWaker) 385 p.pauseChan = make(chan struct{}) 386 p.resumeChan = make(chan struct{}) 387 d.wg.Add(1) 388 // NB: sleeper-waker registration must happen synchronously to avoid races 389 // with `close`. It's possible to pull all this logic into `start`, but 390 // that results in a heap-allocated function literal. 391 go p.start(&d.wg) 392 } 393 } 394 395 // close closes a dispatcher and its processors. 396 func (d *dispatcher) close() { 397 d.mu.Lock() 398 d.closed = true 399 d.mu.Unlock() 400 for i := range d.processors { 401 d.processors[i].close() 402 } 403 } 404 405 // wait waits for all processor goroutines to end. 406 func (d *dispatcher) wait() { 407 d.wg.Wait() 408 } 409 410 // queuePacket queues an incoming packet to the matching tcp endpoint and 411 // also queues the endpoint to a processor queue for processing. 412 func (d *dispatcher) queuePacket(stackEP stack.TransportEndpoint, id stack.TransportEndpointID, clock tcpip.Clock, pkt *stack.PacketBuffer) { 413 d.mu.Lock() 414 closed := d.closed 415 d.mu.Unlock() 416 417 if closed { 418 return 419 } 420 421 ep := stackEP.(*Endpoint) 422 423 s, err := newIncomingSegment(id, clock, pkt) 424 if err != nil { 425 ep.stack.Stats().TCP.InvalidSegmentsReceived.Increment() 426 ep.stats.ReceiveErrors.MalformedPacketsReceived.Increment() 427 return 428 } 429 defer s.DecRef() 430 431 if !s.csumValid { 432 ep.stack.Stats().TCP.ChecksumErrors.Increment() 433 ep.stats.ReceiveErrors.ChecksumErrors.Increment() 434 return 435 } 436 437 ep.stack.Stats().TCP.ValidSegmentsReceived.Increment() 438 ep.stats.SegmentsReceived.Increment() 439 if (s.flags & header.TCPFlagRst) != 0 { 440 ep.stack.Stats().TCP.ResetsReceived.Increment() 441 } 442 443 if !ep.enqueueSegment(s) { 444 return 445 } 446 447 // Only wakeup the processor if endpoint lock is not held by a user 448 // goroutine as endpoint.UnlockUser will wake up the processor if the 449 // segment queue is not empty. 450 if !ep.isOwnedByUser() { 451 d.selectProcessor(id).queueEndpoint(ep) 452 } 453 } 454 455 // selectProcessor uses a hash of the transport endpoint ID to queue the 456 // endpoint to a specific processor. This is required to main TCP ordering as 457 // queueing the same endpoint to multiple processors can *potentially* result in 458 // out of order processing of incoming segments. It also ensures that a dispatcher 459 // evenly loads the processor goroutines. 460 func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor { 461 return &d.processors[d.hasher.hash(id)%uint32(len(d.processors))] 462 } 463 464 // pause pauses a dispatcher and all its processor goroutines. 465 func (d *dispatcher) pause() { 466 d.mu.Lock() 467 d.paused = true 468 d.mu.Unlock() 469 for i := range d.processors { 470 <-d.processors[i].pause() 471 } 472 } 473 474 // resume resumes a previously paused dispatcher and its processor goroutines. 475 // Calling resume on a dispatcher that was never paused is a no-op. 476 func (d *dispatcher) resume() { 477 d.mu.Lock() 478 479 if !d.paused { 480 // If this was a restore run the stack is a new instance and 481 // it was never paused, so just return as there is nothing to 482 // resume. 483 d.mu.Unlock() 484 return 485 } 486 d.paused = false 487 d.mu.Unlock() 488 for i := range d.processors { 489 d.processors[i].resume() 490 } 491 } 492 493 // jenkinsHasher contains state needed to for a jenkins hash. 494 type jenkinsHasher struct { 495 seed uint32 496 } 497 498 // hash hashes the provided TransportEndpointID using the jenkins hash 499 // algorithm. 500 func (j jenkinsHasher) hash(id stack.TransportEndpointID) uint32 { 501 var payload [4]byte 502 binary.LittleEndian.PutUint16(payload[0:], id.LocalPort) 503 binary.LittleEndian.PutUint16(payload[2:], id.RemotePort) 504 505 h := jenkins.Sum32(j.seed) 506 h.Write(payload[:]) 507 h.Write(id.LocalAddress.AsSlice()) 508 h.Write(id.RemoteAddress.AsSlice()) 509 return h.Sum32() 510 }