github.com/lightlus/netstack@v1.2.0/tcpip/transport/tcp/rcv.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "container/heap" 19 "time" 20 21 "github.com/lightlus/netstack/tcpip" 22 "github.com/lightlus/netstack/tcpip/header" 23 "github.com/lightlus/netstack/tcpip/seqnum" 24 ) 25 26 // receiver holds the state necessary to receive TCP segments and turn them 27 // into a stream of bytes. 28 // 29 // +stateify savable 30 type receiver struct { 31 ep *endpoint 32 33 rcvNxt seqnum.Value 34 35 // rcvAcc is one beyond the last acceptable sequence number. That is, 36 // the "largest" sequence value that the receiver has announced to the 37 // its peer that it's willing to accept. This may be different than 38 // rcvNxt + rcvWnd if the receive window is reduced; in that case we 39 // have to reduce the window as we receive more data instead of 40 // shrinking it. 41 rcvAcc seqnum.Value 42 43 // rcvWnd is the non-scaled receive window last advertised to the peer. 44 rcvWnd seqnum.Size 45 46 rcvWndScale uint8 47 48 closed bool 49 50 pendingRcvdSegments segmentHeap 51 pendingBufUsed seqnum.Size 52 pendingBufSize seqnum.Size 53 } 54 55 func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver { 56 return &receiver{ 57 ep: ep, 58 rcvNxt: irs + 1, 59 rcvAcc: irs.Add(rcvWnd + 1), 60 rcvWnd: rcvWnd, 61 rcvWndScale: rcvWndScale, 62 pendingBufSize: pendingBufSize, 63 } 64 } 65 66 // acceptable checks if the segment sequence number range is acceptable 67 // according to the table on page 26 of RFC 793. 68 func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool { 69 rcvWnd := r.rcvNxt.Size(r.rcvAcc) 70 if rcvWnd == 0 { 71 return segLen == 0 && segSeq == r.rcvNxt 72 } 73 74 return segSeq.InWindow(r.rcvNxt, rcvWnd) || 75 seqnum.Overlap(r.rcvNxt, rcvWnd, segSeq, segLen) 76 } 77 78 // getSendParams returns the parameters needed by the sender when building 79 // segments to send. 80 func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) { 81 // Calculate the window size based on the available buffer space. 82 receiveBufferAvailable := r.ep.receiveBufferAvailable() 83 acc := r.rcvNxt.Add(seqnum.Size(receiveBufferAvailable)) 84 if r.rcvAcc.LessThan(acc) { 85 r.rcvAcc = acc 86 } 87 // Stash away the non-scaled receive window as we use it for measuring 88 // receiver's estimated RTT. 89 r.rcvWnd = r.rcvNxt.Size(r.rcvAcc) 90 return r.rcvNxt, r.rcvWnd >> r.rcvWndScale 91 } 92 93 // nonZeroWindow is called when the receive window grows from zero to nonzero; 94 // in such cases we may need to send an ack to indicate to our peer that it can 95 // resume sending data. 96 func (r *receiver) nonZeroWindow() { 97 if (r.rcvAcc-r.rcvNxt)>>r.rcvWndScale != 0 { 98 // We never got around to announcing a zero window size, so we 99 // don't need to immediately announce a nonzero one. 100 return 101 } 102 103 // Immediately send an ack. 104 r.ep.snd.sendAck() 105 } 106 107 // consumeSegment attempts to consume a segment that was received by r. The 108 // segment may have just been received or may have been received earlier but 109 // wasn't ready to be consumed then. 110 // 111 // Returns true if the segment was consumed, false if it cannot be consumed 112 // yet because of a missing segment. 113 func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum.Size) bool { 114 if segLen > 0 { 115 // If the segment doesn't include the seqnum we're expecting to 116 // consume now, we're missing a segment. We cannot proceed until 117 // we receive that segment though. 118 if !r.rcvNxt.InWindow(segSeq, segLen) { 119 return false 120 } 121 122 // Trim segment to eliminate already acknowledged data. 123 if segSeq.LessThan(r.rcvNxt) { 124 diff := segSeq.Size(r.rcvNxt) 125 segLen -= diff 126 segSeq.UpdateForward(diff) 127 s.sequenceNumber.UpdateForward(diff) 128 s.data.TrimFront(int(diff)) 129 } 130 131 // Move segment to ready-to-deliver list. Wakeup any waiters. 132 r.ep.readyToRead(s) 133 134 } else if segSeq != r.rcvNxt { 135 return false 136 } 137 138 // Update the segment that we're expecting to consume. 139 r.rcvNxt = segSeq.Add(segLen) 140 141 // In cases of a misbehaving sender which could send more than the 142 // advertised window, we could end up in a situation where we get a 143 // segment that exceeds the window advertised. Instead of partially 144 // accepting the segment and discarding bytes beyond the advertised 145 // window, we accept the whole segment and make sure r.rcvAcc is moved 146 // forward to match r.rcvNxt to indicate that the window is now closed. 147 // 148 // In absence of this check the r.acceptable() check fails and accepts 149 // segments that should be dropped because rcvWnd is calculated as 150 // the size of the interval (rcvNxt, rcvAcc] which becomes extremely 151 // large if rcvAcc is ever less than rcvNxt. 152 if r.rcvAcc.LessThan(r.rcvNxt) { 153 r.rcvAcc = r.rcvNxt 154 } 155 156 // Trim SACK Blocks to remove any SACK information that covers 157 // sequence numbers that have been consumed. 158 TrimSACKBlockList(&r.ep.sack, r.rcvNxt) 159 160 // Handle FIN or FIN-ACK. 161 if s.flagIsSet(header.TCPFlagFin) { 162 r.rcvNxt++ 163 164 // Send ACK immediately. 165 r.ep.snd.sendAck() 166 167 // Tell any readers that no more data will come. 168 r.closed = true 169 r.ep.readyToRead(nil) 170 171 // We just received a FIN, our next state depends on whether we sent a 172 // FIN already or not. 173 r.ep.mu.Lock() 174 switch r.ep.state { 175 case StateEstablished: 176 r.ep.state = StateCloseWait 177 case StateFinWait1: 178 if s.flagIsSet(header.TCPFlagAck) { 179 // FIN-ACK, transition to TIME-WAIT. 180 r.ep.state = StateTimeWait 181 } else { 182 // Simultaneous close, expecting a final ACK. 183 r.ep.state = StateClosing 184 } 185 case StateFinWait2: 186 r.ep.state = StateTimeWait 187 } 188 r.ep.mu.Unlock() 189 190 // Flush out any pending segments, except the very first one if 191 // it happens to be the one we're handling now because the 192 // caller is using it. 193 first := 0 194 if len(r.pendingRcvdSegments) != 0 && r.pendingRcvdSegments[0] == s { 195 first = 1 196 } 197 198 for i := first; i < len(r.pendingRcvdSegments); i++ { 199 r.pendingRcvdSegments[i].decRef() 200 } 201 r.pendingRcvdSegments = r.pendingRcvdSegments[:first] 202 203 return true 204 } 205 206 // Handle ACK (not FIN-ACK, which we handled above) during one of the 207 // shutdown states. 208 if s.flagIsSet(header.TCPFlagAck) { 209 r.ep.mu.Lock() 210 switch r.ep.state { 211 case StateFinWait1: 212 r.ep.state = StateFinWait2 213 // Notify protocol goroutine that we have received an 214 // ACK to our FIN so that it can start the FIN_WAIT2 215 // timer to abort connection if the other side does 216 // not close within 2MSL. 217 r.ep.notifyProtocolGoroutine(notifyClose) 218 case StateClosing: 219 r.ep.state = StateTimeWait 220 case StateLastAck: 221 r.ep.transitionToStateCloseLocked() 222 } 223 r.ep.mu.Unlock() 224 } 225 226 return true 227 } 228 229 // updateRTT updates the receiver RTT measurement based on the sequence number 230 // of the received segment. 231 func (r *receiver) updateRTT() { 232 // From: https://public.lanl.gov/radiant/pubs/drs/sc2001-poster.pdf 233 // 234 // A system that is only transmitting acknowledgements can still 235 // estimate the round-trip time by observing the time between when a byte 236 // is first acknowledged and the receipt of data that is at least one 237 // window beyond the sequence number that was acknowledged. 238 r.ep.rcvListMu.Lock() 239 if r.ep.rcvAutoParams.rttMeasureTime.IsZero() { 240 // New measurement. 241 r.ep.rcvAutoParams.rttMeasureTime = time.Now() 242 r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd) 243 r.ep.rcvListMu.Unlock() 244 return 245 } 246 if r.rcvNxt.LessThan(r.ep.rcvAutoParams.rttMeasureSeqNumber) { 247 r.ep.rcvListMu.Unlock() 248 return 249 } 250 rtt := time.Since(r.ep.rcvAutoParams.rttMeasureTime) 251 // We only store the minimum observed RTT here as this is only used in 252 // absence of a SRTT available from either timestamps or a sender 253 // measurement of RTT. 254 if r.ep.rcvAutoParams.rtt == 0 || rtt < r.ep.rcvAutoParams.rtt { 255 r.ep.rcvAutoParams.rtt = rtt 256 } 257 r.ep.rcvAutoParams.rttMeasureTime = time.Now() 258 r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd) 259 r.ep.rcvListMu.Unlock() 260 } 261 262 func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, closed bool) (drop bool, err *tcpip.Error) { 263 r.ep.rcvListMu.Lock() 264 rcvClosed := r.ep.rcvClosed || r.closed 265 r.ep.rcvListMu.Unlock() 266 267 // If we are in one of the shutdown states then we need to do 268 // additional checks before we try and process the segment. 269 switch state { 270 case StateCloseWait, StateClosing, StateLastAck: 271 if !s.sequenceNumber.LessThanEq(r.rcvNxt) { 272 s.decRef() 273 // Just drop the segment as we have 274 // already received a FIN and this 275 // segment is after the sequence number 276 // for the FIN. 277 return true, nil 278 } 279 fallthrough 280 case StateFinWait1: 281 fallthrough 282 case StateFinWait2: 283 // If we are closed for reads (either due to an 284 // incoming FIN or the user calling shutdown(.., 285 // SHUT_RD) then any data past the rcvNxt should 286 // trigger a RST. 287 endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size())) 288 if rcvClosed && r.rcvNxt.LessThan(endDataSeq) { 289 s.decRef() 290 return true, tcpip.ErrConnectionAborted 291 } 292 if state == StateFinWait1 { 293 break 294 } 295 296 // If it's a retransmission of an old data segment 297 // or a pure ACK then allow it. 298 if s.sequenceNumber.Add(s.logicalLen()).LessThanEq(r.rcvNxt) || 299 s.logicalLen() == 0 { 300 break 301 } 302 303 // In FIN-WAIT2 if the socket is fully 304 // closed(not owned by application on our end 305 // then the only acceptable segment is a 306 // FIN. Since FIN can technically also carry 307 // data we verify that the segment carrying a 308 // FIN ends at exactly e.rcvNxt+1. 309 // 310 // From RFC793 page 25. 311 // 312 // For sequence number purposes, the SYN is 313 // considered to occur before the first actual 314 // data octet of the segment in which it occurs, 315 // while the FIN is considered to occur after 316 // the last actual data octet in a segment in 317 // which it occurs. 318 if closed && (!s.flagIsSet(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.rcvNxt+1) { 319 s.decRef() 320 return true, tcpip.ErrConnectionAborted 321 } 322 } 323 324 // We don't care about receive processing anymore if the receive side 325 // is closed. 326 // 327 // NOTE: We still want to permit a FIN as it's possible only our 328 // end has closed and the peer is yet to send a FIN. Hence we 329 // compare only the payload. 330 segEnd := s.sequenceNumber.Add(seqnum.Size(s.data.Size())) 331 if rcvClosed && !segEnd.LessThanEq(r.rcvNxt) { 332 return true, nil 333 } 334 return false, nil 335 } 336 337 // handleRcvdSegment handles TCP segments directed at the connection managed by 338 // r as they arrive. It is called by the protocol main loop. 339 func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) { 340 r.ep.mu.RLock() 341 state := r.ep.state 342 closed := r.ep.closed 343 r.ep.mu.RUnlock() 344 345 if state != StateEstablished { 346 drop, err := r.handleRcvdSegmentClosing(s, state, closed) 347 if drop || err != nil { 348 return drop, err 349 } 350 } 351 352 segLen := seqnum.Size(s.data.Size()) 353 segSeq := s.sequenceNumber 354 355 // If the sequence number range is outside the acceptable range, just 356 // send an ACK and stop further processing of the segment. 357 // This is according to RFC 793, page 68. 358 if !r.acceptable(segSeq, segLen) { 359 r.ep.snd.sendAck() 360 return true, nil 361 } 362 363 // Defer segment processing if it can't be consumed now. 364 if !r.consumeSegment(s, segSeq, segLen) { 365 if segLen > 0 || s.flagIsSet(header.TCPFlagFin) { 366 // We only store the segment if it's within our buffer 367 // size limit. 368 if r.pendingBufUsed < r.pendingBufSize { 369 r.pendingBufUsed += s.logicalLen() 370 s.incRef() 371 heap.Push(&r.pendingRcvdSegments, s) 372 UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.rcvNxt) 373 } 374 375 // Immediately send an ack so that the peer knows it may 376 // have to retransmit. 377 r.ep.snd.sendAck() 378 } 379 return false, nil 380 } 381 382 // Since we consumed a segment update the receiver's RTT estimate 383 // if required. 384 if segLen > 0 { 385 r.updateRTT() 386 } 387 388 // By consuming the current segment, we may have filled a gap in the 389 // sequence number domain that allows pending segments to be consumed 390 // now. So try to do it. 391 for !r.closed && r.pendingRcvdSegments.Len() > 0 { 392 s := r.pendingRcvdSegments[0] 393 segLen := seqnum.Size(s.data.Size()) 394 segSeq := s.sequenceNumber 395 396 // Skip segment altogether if it has already been acknowledged. 397 if !segSeq.Add(segLen-1).LessThan(r.rcvNxt) && 398 !r.consumeSegment(s, segSeq, segLen) { 399 break 400 } 401 402 heap.Pop(&r.pendingRcvdSegments) 403 r.pendingBufUsed -= s.logicalLen() 404 s.decRef() 405 } 406 return false, nil 407 } 408 409 // handleTimeWaitSegment handles inbound segments received when the endpoint 410 // has entered the TIME_WAIT state. 411 func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn bool) { 412 segSeq := s.sequenceNumber 413 segLen := seqnum.Size(s.data.Size()) 414 415 // Just silently drop any RST packets in TIME_WAIT. We do not support 416 // TIME_WAIT assasination as a result we confirm w/ fix 1 as described 417 // in https://tools.ietf.org/html/rfc1337#section-3. 418 if s.flagIsSet(header.TCPFlagRst) { 419 return false, false 420 } 421 422 // If it's a SYN and the sequence number is higher than any seen before 423 // for this connection then try and redirect it to a listening endpoint 424 // if available. 425 // 426 // RFC 1122: 427 // "When a connection is [...] on TIME-WAIT state [...] 428 // [a TCP] MAY accept a new SYN from the remote TCP to 429 // reopen the connection directly, if it: 430 431 // (1) assigns its initial sequence number for the new 432 // connection to be larger than the largest sequence 433 // number it used on the previous connection incarnation, 434 // and 435 436 // (2) returns to TIME-WAIT state if the SYN turns out 437 // to be an old duplicate". 438 if s.flagIsSet(header.TCPFlagSyn) && r.rcvNxt.LessThan(segSeq) { 439 440 return false, true 441 } 442 443 // Drop the segment if it does not contain an ACK. 444 if !s.flagIsSet(header.TCPFlagAck) { 445 return false, false 446 } 447 448 // Update Timestamp if required. See RFC7323, section-4.3. 449 if r.ep.sendTSOk && s.parsedOptions.TS { 450 r.ep.updateRecentTimestamp(s.parsedOptions.TSVal, r.ep.snd.maxSentAck, segSeq) 451 } 452 453 if segSeq.Add(1) == r.rcvNxt && s.flagIsSet(header.TCPFlagFin) { 454 // If it's a FIN-ACK then resetTimeWait and send an ACK, as it 455 // indicates our final ACK could have been lost. 456 r.ep.snd.sendAck() 457 return true, false 458 } 459 460 // If the sequence number range is outside the acceptable range or 461 // carries data then just send an ACK. This is according to RFC 793, 462 // page 37. 463 // 464 // NOTE: In TIME_WAIT the only acceptable sequence number is rcvNxt. 465 if segSeq != r.rcvNxt || segLen != 0 { 466 r.ep.snd.sendAck() 467 } 468 return false, false 469 }