gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/transport/tcp/protocol.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package tcp contains the implementation of the TCP transport protocol. 16 package tcp 17 18 import ( 19 "crypto/sha256" 20 "encoding/binary" 21 "fmt" 22 "runtime" 23 "strings" 24 "time" 25 26 "gvisor.dev/gvisor/pkg/sync" 27 "gvisor.dev/gvisor/pkg/tcpip" 28 "gvisor.dev/gvisor/pkg/tcpip/header" 29 "gvisor.dev/gvisor/pkg/tcpip/header/parse" 30 "gvisor.dev/gvisor/pkg/tcpip/internal/tcp" 31 "gvisor.dev/gvisor/pkg/tcpip/seqnum" 32 "gvisor.dev/gvisor/pkg/tcpip/stack" 33 "gvisor.dev/gvisor/pkg/tcpip/transport/raw" 34 "gvisor.dev/gvisor/pkg/waiter" 35 ) 36 37 const ( 38 // ProtocolNumber is the tcp protocol number. 39 ProtocolNumber = header.TCPProtocolNumber 40 41 // MinBufferSize is the smallest size of a receive or send buffer. 42 MinBufferSize = 4 << 10 // 4096 bytes. 43 44 // DefaultSendBufferSize is the default size of the send buffer for 45 // an endpoint. 46 DefaultSendBufferSize = 1 << 20 // 1MB 47 48 // DefaultReceiveBufferSize is the default size of the receive buffer 49 // for an endpoint. 50 DefaultReceiveBufferSize = 1 << 20 // 1MB 51 52 // MaxBufferSize is the largest size a receive/send buffer can grow to. 53 MaxBufferSize = 4 << 20 // 4MB 54 55 // DefaultTCPLingerTimeout is the amount of time that sockets linger in 56 // FIN_WAIT_2 state before being marked closed. 57 DefaultTCPLingerTimeout = 60 * time.Second 58 59 // MaxTCPLingerTimeout is the maximum amount of time that sockets 60 // linger in FIN_WAIT_2 state before being marked closed. 61 MaxTCPLingerTimeout = 120 * time.Second 62 63 // DefaultTCPTimeWaitTimeout is the amount of time that sockets linger 64 // in TIME_WAIT state before being marked closed. 65 DefaultTCPTimeWaitTimeout = 60 * time.Second 66 67 // DefaultSynRetries is the default value for the number of SYN retransmits 68 // before a connect is aborted. 69 DefaultSynRetries = 6 70 71 // DefaultKeepaliveIdle is the idle time for a connection before keep-alive 72 // probes are sent. 73 DefaultKeepaliveIdle = 2 * time.Hour 74 75 // DefaultKeepaliveInterval is the time between two successive keep-alive 76 // probes. 77 DefaultKeepaliveInterval = 75 * time.Second 78 79 // DefaultKeepaliveCount is the number of keep-alive probes that are sent 80 // before declaring the connection dead. 81 DefaultKeepaliveCount = 9 82 ) 83 84 const ( 85 ccReno = "reno" 86 ccCubic = "cubic" 87 ) 88 89 type protocol struct { 90 stack *stack.Stack 91 92 mu sync.RWMutex 93 sackEnabled bool 94 recovery tcpip.TCPRecovery 95 delayEnabled bool 96 alwaysUseSynCookies bool 97 sendBufferSize tcpip.TCPSendBufferSizeRangeOption 98 recvBufferSize tcpip.TCPReceiveBufferSizeRangeOption 99 congestionControl string 100 availableCongestionControl []string 101 moderateReceiveBuffer bool 102 lingerTimeout time.Duration 103 timeWaitTimeout time.Duration 104 timeWaitReuse tcpip.TCPTimeWaitReuseOption 105 minRTO time.Duration 106 maxRTO time.Duration 107 maxRetries uint32 108 synRetries uint8 109 dispatcher dispatcher 110 111 // The following secrets are initialized once and stay unchanged after. 112 seqnumSecret [16]byte 113 tsOffsetSecret [16]byte 114 } 115 116 // Number returns the tcp protocol number. 117 func (*protocol) Number() tcpip.TransportProtocolNumber { 118 return ProtocolNumber 119 } 120 121 // NewEndpoint creates a new tcp endpoint. 122 func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { 123 return newEndpoint(p.stack, p, netProto, waiterQueue), nil 124 } 125 126 // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently 127 // unsupported. It implements stack.TransportProtocol.NewRawEndpoint. 128 func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { 129 return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue) 130 } 131 132 // MinimumPacketSize returns the minimum valid tcp packet size. 133 func (*protocol) MinimumPacketSize() int { 134 return header.TCPMinimumSize 135 } 136 137 // ParsePorts returns the source and destination ports stored in the given tcp 138 // packet. 139 func (*protocol) ParsePorts(v []byte) (src, dst uint16, err tcpip.Error) { 140 h := header.TCP(v) 141 return h.SourcePort(), h.DestinationPort(), nil 142 } 143 144 // QueuePacket queues packets targeted at an endpoint after hashing the packet 145 // to a specific processing queue. Each queue is serviced by its own processor 146 // goroutine which is responsible for dequeuing and doing full TCP dispatch of 147 // the packet. 148 func (p *protocol) QueuePacket(ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { 149 p.dispatcher.queuePacket(ep, id, p.stack.Clock(), pkt) 150 } 151 152 // HandleUnknownDestinationPacket handles packets targeted at this protocol but 153 // that don't match any existing endpoint. 154 // 155 // RFC 793, page 36, states that "If the connection does not exist (CLOSED) then 156 // a reset is sent in response to any incoming segment except another reset. In 157 // particular, SYNs addressed to a non-existent connection are rejected by this 158 // means." 159 func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { 160 s, err := newIncomingSegment(id, p.stack.Clock(), pkt) 161 if err != nil { 162 return stack.UnknownDestinationPacketMalformed 163 } 164 defer s.DecRef() 165 if !s.csumValid { 166 return stack.UnknownDestinationPacketMalformed 167 } 168 169 if !s.flags.Contains(header.TCPFlagRst) { 170 replyWithReset(p.stack, s, stack.DefaultTOS, tcpip.UseDefaultIPv4TTL, tcpip.UseDefaultIPv6HopLimit) 171 } 172 173 return stack.UnknownDestinationPacketHandled 174 } 175 176 func (p *protocol) tsOffset(src, dst tcpip.Address) tcp.TSOffset { 177 // Initialize a random tsOffset that will be added to the recentTS 178 // everytime the timestamp is sent when the Timestamp option is enabled. 179 // 180 // See https://tools.ietf.org/html/rfc7323#section-5.4 for details on 181 // why this is required. 182 h := sha256.New() 183 184 // Per hash.Hash.Writer: 185 // 186 // It never returns an error. 187 _, _ = h.Write(p.tsOffsetSecret[:]) 188 _, _ = h.Write(src.AsSlice()) 189 _, _ = h.Write(dst.AsSlice()) 190 return tcp.NewTSOffset(binary.LittleEndian.Uint32(h.Sum(nil)[:4])) 191 } 192 193 // replyWithReset replies to the given segment with a reset segment. 194 // 195 // If the relevant TTL has its reset value (0 for ipv4TTL, -1 for ipv6HopLimit), 196 // then the route's default TTL will be used. 197 func replyWithReset(st *stack.Stack, s *segment, tos, ipv4TTL uint8, ipv6HopLimit int16) tcpip.Error { 198 net := s.pkt.Network() 199 route, err := st.FindRoute(s.pkt.NICID, net.DestinationAddress(), net.SourceAddress(), s.pkt.NetworkProtocolNumber, false /* multicastLoop */) 200 if err != nil { 201 return err 202 } 203 defer route.Release() 204 205 ttl := calculateTTL(route, ipv4TTL, ipv6HopLimit) 206 207 // Get the seqnum from the packet if the ack flag is set. 208 seq := seqnum.Value(0) 209 ack := seqnum.Value(0) 210 flags := header.TCPFlagRst 211 // As per RFC 793 page 35 (Reset Generation) 212 // 1. If the connection does not exist (CLOSED) then a reset is sent 213 // in response to any incoming segment except another reset. In 214 // particular, SYNs addressed to a non-existent connection are rejected 215 // by this means. 216 217 // If the incoming segment has an ACK field, the reset takes its 218 // sequence number from the ACK field of the segment, otherwise the 219 // reset has sequence number zero and the ACK field is set to the sum 220 // of the sequence number and segment length of the incoming segment. 221 // The connection remains in the CLOSED state. 222 if s.flags.Contains(header.TCPFlagAck) { 223 seq = s.ackNumber 224 } else { 225 flags |= header.TCPFlagAck 226 ack = s.sequenceNumber.Add(s.logicalLen()) 227 } 228 229 p := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: header.TCPMinimumSize + int(route.MaxHeaderLength())}) 230 defer p.DecRef() 231 return sendTCP(route, tcpFields{ 232 id: s.id, 233 ttl: ttl, 234 tos: tos, 235 flags: flags, 236 seq: seq, 237 ack: ack, 238 rcvWnd: 0, 239 }, p, stack.GSO{}, nil /* PacketOwner */) 240 } 241 242 // SetOption implements stack.TransportProtocol.SetOption. 243 func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error { 244 switch v := option.(type) { 245 case *tcpip.TCPSACKEnabled: 246 p.mu.Lock() 247 p.sackEnabled = bool(*v) 248 p.mu.Unlock() 249 return nil 250 251 case *tcpip.TCPRecovery: 252 p.mu.Lock() 253 p.recovery = *v 254 p.mu.Unlock() 255 return nil 256 257 case *tcpip.TCPDelayEnabled: 258 p.mu.Lock() 259 p.delayEnabled = bool(*v) 260 p.mu.Unlock() 261 return nil 262 263 case *tcpip.TCPSendBufferSizeRangeOption: 264 if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { 265 return &tcpip.ErrInvalidOptionValue{} 266 } 267 p.mu.Lock() 268 p.sendBufferSize = *v 269 p.mu.Unlock() 270 return nil 271 272 case *tcpip.TCPReceiveBufferSizeRangeOption: 273 if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { 274 return &tcpip.ErrInvalidOptionValue{} 275 } 276 p.mu.Lock() 277 p.recvBufferSize = *v 278 p.mu.Unlock() 279 return nil 280 281 case *tcpip.CongestionControlOption: 282 for _, c := range p.availableCongestionControl { 283 if string(*v) == c { 284 p.mu.Lock() 285 p.congestionControl = string(*v) 286 p.mu.Unlock() 287 return nil 288 } 289 } 290 // linux returns ENOENT when an invalid congestion control 291 // is specified. 292 return &tcpip.ErrNoSuchFile{} 293 294 case *tcpip.TCPModerateReceiveBufferOption: 295 p.mu.Lock() 296 p.moderateReceiveBuffer = bool(*v) 297 p.mu.Unlock() 298 return nil 299 300 case *tcpip.TCPLingerTimeoutOption: 301 p.mu.Lock() 302 if *v < 0 { 303 p.lingerTimeout = 0 304 } else { 305 p.lingerTimeout = time.Duration(*v) 306 } 307 p.mu.Unlock() 308 return nil 309 310 case *tcpip.TCPTimeWaitTimeoutOption: 311 p.mu.Lock() 312 if *v < 0 { 313 p.timeWaitTimeout = 0 314 } else { 315 p.timeWaitTimeout = time.Duration(*v) 316 } 317 p.mu.Unlock() 318 return nil 319 320 case *tcpip.TCPTimeWaitReuseOption: 321 if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly { 322 return &tcpip.ErrInvalidOptionValue{} 323 } 324 p.mu.Lock() 325 p.timeWaitReuse = *v 326 p.mu.Unlock() 327 return nil 328 329 case *tcpip.TCPMinRTOOption: 330 p.mu.Lock() 331 defer p.mu.Unlock() 332 if *v < 0 { 333 p.minRTO = MinRTO 334 } else if minRTO := time.Duration(*v); minRTO <= p.maxRTO { 335 p.minRTO = minRTO 336 } else { 337 return &tcpip.ErrInvalidOptionValue{} 338 } 339 return nil 340 341 case *tcpip.TCPMaxRTOOption: 342 p.mu.Lock() 343 defer p.mu.Unlock() 344 if *v < 0 { 345 p.maxRTO = MaxRTO 346 } else if maxRTO := time.Duration(*v); maxRTO >= p.minRTO { 347 p.maxRTO = maxRTO 348 } else { 349 return &tcpip.ErrInvalidOptionValue{} 350 } 351 return nil 352 353 case *tcpip.TCPMaxRetriesOption: 354 p.mu.Lock() 355 p.maxRetries = uint32(*v) 356 p.mu.Unlock() 357 return nil 358 359 case *tcpip.TCPAlwaysUseSynCookies: 360 p.mu.Lock() 361 p.alwaysUseSynCookies = bool(*v) 362 p.mu.Unlock() 363 return nil 364 365 case *tcpip.TCPSynRetriesOption: 366 if *v < 1 || *v > 255 { 367 return &tcpip.ErrInvalidOptionValue{} 368 } 369 p.mu.Lock() 370 p.synRetries = uint8(*v) 371 p.mu.Unlock() 372 return nil 373 374 default: 375 return &tcpip.ErrUnknownProtocolOption{} 376 } 377 } 378 379 // Option implements stack.TransportProtocol.Option. 380 func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Error { 381 switch v := option.(type) { 382 case *tcpip.TCPSACKEnabled: 383 p.mu.RLock() 384 *v = tcpip.TCPSACKEnabled(p.sackEnabled) 385 p.mu.RUnlock() 386 return nil 387 388 case *tcpip.TCPRecovery: 389 p.mu.RLock() 390 *v = p.recovery 391 p.mu.RUnlock() 392 return nil 393 394 case *tcpip.TCPDelayEnabled: 395 p.mu.RLock() 396 *v = tcpip.TCPDelayEnabled(p.delayEnabled) 397 p.mu.RUnlock() 398 return nil 399 400 case *tcpip.TCPSendBufferSizeRangeOption: 401 p.mu.RLock() 402 *v = p.sendBufferSize 403 p.mu.RUnlock() 404 return nil 405 406 case *tcpip.TCPReceiveBufferSizeRangeOption: 407 p.mu.RLock() 408 *v = p.recvBufferSize 409 p.mu.RUnlock() 410 return nil 411 412 case *tcpip.CongestionControlOption: 413 p.mu.RLock() 414 *v = tcpip.CongestionControlOption(p.congestionControl) 415 p.mu.RUnlock() 416 return nil 417 418 case *tcpip.TCPAvailableCongestionControlOption: 419 p.mu.RLock() 420 *v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " ")) 421 p.mu.RUnlock() 422 return nil 423 424 case *tcpip.TCPModerateReceiveBufferOption: 425 p.mu.RLock() 426 *v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer) 427 p.mu.RUnlock() 428 return nil 429 430 case *tcpip.TCPLingerTimeoutOption: 431 p.mu.RLock() 432 *v = tcpip.TCPLingerTimeoutOption(p.lingerTimeout) 433 p.mu.RUnlock() 434 return nil 435 436 case *tcpip.TCPTimeWaitTimeoutOption: 437 p.mu.RLock() 438 *v = tcpip.TCPTimeWaitTimeoutOption(p.timeWaitTimeout) 439 p.mu.RUnlock() 440 return nil 441 442 case *tcpip.TCPTimeWaitReuseOption: 443 p.mu.RLock() 444 *v = p.timeWaitReuse 445 p.mu.RUnlock() 446 return nil 447 448 case *tcpip.TCPMinRTOOption: 449 p.mu.RLock() 450 *v = tcpip.TCPMinRTOOption(p.minRTO) 451 p.mu.RUnlock() 452 return nil 453 454 case *tcpip.TCPMaxRTOOption: 455 p.mu.RLock() 456 *v = tcpip.TCPMaxRTOOption(p.maxRTO) 457 p.mu.RUnlock() 458 return nil 459 460 case *tcpip.TCPMaxRetriesOption: 461 p.mu.RLock() 462 *v = tcpip.TCPMaxRetriesOption(p.maxRetries) 463 p.mu.RUnlock() 464 return nil 465 466 case *tcpip.TCPAlwaysUseSynCookies: 467 p.mu.RLock() 468 *v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies) 469 p.mu.RUnlock() 470 return nil 471 472 case *tcpip.TCPSynRetriesOption: 473 p.mu.RLock() 474 *v = tcpip.TCPSynRetriesOption(p.synRetries) 475 p.mu.RUnlock() 476 return nil 477 478 default: 479 return &tcpip.ErrUnknownProtocolOption{} 480 } 481 } 482 483 // SendBufferSize implements stack.SendBufSizeProto. 484 func (p *protocol) SendBufferSize() tcpip.TCPSendBufferSizeRangeOption { 485 p.mu.RLock() 486 defer p.mu.RUnlock() 487 return p.sendBufferSize 488 } 489 490 // Close implements stack.TransportProtocol.Close. 491 func (p *protocol) Close() { 492 p.dispatcher.close() 493 } 494 495 // Wait implements stack.TransportProtocol.Wait. 496 func (p *protocol) Wait() { 497 p.dispatcher.wait() 498 } 499 500 // Pause implements stack.TransportProtocol.Pause. 501 func (p *protocol) Pause() { 502 p.dispatcher.pause() 503 } 504 505 // Resume implements stack.TransportProtocol.Resume. 506 func (p *protocol) Resume() { 507 p.dispatcher.resume() 508 } 509 510 // Parse implements stack.TransportProtocol.Parse. 511 func (*protocol) Parse(pkt *stack.PacketBuffer) bool { 512 return parse.TCP(pkt) 513 } 514 515 // NewProtocol returns a TCP transport protocol. 516 func NewProtocol(s *stack.Stack) stack.TransportProtocol { 517 rng := s.SecureRNG() 518 var seqnumSecret [16]byte 519 var tsOffsetSecret [16]byte 520 if n, err := rng.Reader.Read(seqnumSecret[:]); err != nil || n != len(seqnumSecret) { 521 panic(fmt.Sprintf("Read() failed: %v", err)) 522 } 523 if n, err := rng.Reader.Read(tsOffsetSecret[:]); err != nil || n != len(tsOffsetSecret) { 524 panic(fmt.Sprintf("Read() failed: %v", err)) 525 } 526 p := protocol{ 527 stack: s, 528 sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{ 529 Min: MinBufferSize, 530 Default: DefaultSendBufferSize, 531 Max: MaxBufferSize, 532 }, 533 recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{ 534 Min: MinBufferSize, 535 Default: DefaultReceiveBufferSize, 536 Max: MaxBufferSize, 537 }, 538 sackEnabled: true, 539 congestionControl: ccReno, 540 availableCongestionControl: []string{ccReno, ccCubic}, 541 moderateReceiveBuffer: true, 542 lingerTimeout: DefaultTCPLingerTimeout, 543 timeWaitTimeout: DefaultTCPTimeWaitTimeout, 544 timeWaitReuse: tcpip.TCPTimeWaitReuseLoopbackOnly, 545 synRetries: DefaultSynRetries, 546 minRTO: MinRTO, 547 maxRTO: MaxRTO, 548 maxRetries: MaxRetries, 549 recovery: tcpip.TCPRACKLossDetection, 550 seqnumSecret: seqnumSecret, 551 tsOffsetSecret: tsOffsetSecret, 552 } 553 p.dispatcher.init(s.InsecureRNG(), runtime.GOMAXPROCS(0)) 554 return &p 555 } 556 557 // protocolFromStack retrieves the tcp.protocol instance from stack s. 558 func protocolFromStack(s *stack.Stack) *protocol { 559 return s.TransportProtocolInstance(ProtocolNumber).(*protocol) 560 }