inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/transport/tcp/protocol.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package tcp contains the implementation of the TCP transport protocol. 16 package tcp 17 18 import ( 19 "runtime" 20 "strings" 21 "time" 22 23 "inet.af/netstack/sync" 24 "inet.af/netstack/tcpip" 25 "inet.af/netstack/tcpip/buffer" 26 "inet.af/netstack/tcpip/hash/jenkins" 27 "inet.af/netstack/tcpip/header" 28 "inet.af/netstack/tcpip/header/parse" 29 "inet.af/netstack/tcpip/internal/tcp" 30 "inet.af/netstack/tcpip/seqnum" 31 "inet.af/netstack/tcpip/stack" 32 "inet.af/netstack/tcpip/transport/raw" 33 "inet.af/netstack/waiter" 34 ) 35 36 const ( 37 // ProtocolNumber is the tcp protocol number. 38 ProtocolNumber = header.TCPProtocolNumber 39 40 // MinBufferSize is the smallest size of a receive or send buffer. 41 MinBufferSize = 4 << 10 // 4096 bytes. 42 43 // DefaultSendBufferSize is the default size of the send buffer for 44 // an endpoint. 45 DefaultSendBufferSize = 1 << 20 // 1MB 46 47 // DefaultReceiveBufferSize is the default size of the receive buffer 48 // for an endpoint. 49 DefaultReceiveBufferSize = 1 << 20 // 1MB 50 51 // MaxBufferSize is the largest size a receive/send buffer can grow to. 52 MaxBufferSize = 4 << 20 // 4MB 53 54 // DefaultTCPLingerTimeout is the amount of time that sockets linger in 55 // FIN_WAIT_2 state before being marked closed. 56 DefaultTCPLingerTimeout = 60 * time.Second 57 58 // MaxTCPLingerTimeout is the maximum amount of time that sockets 59 // linger in FIN_WAIT_2 state before being marked closed. 60 MaxTCPLingerTimeout = 120 * time.Second 61 62 // DefaultTCPTimeWaitTimeout is the amount of time that sockets linger 63 // in TIME_WAIT state before being marked closed. 64 DefaultTCPTimeWaitTimeout = 60 * time.Second 65 66 // DefaultSynRetries is the default value for the number of SYN retransmits 67 // before a connect is aborted. 68 DefaultSynRetries = 6 69 70 // DefaultKeepaliveIdle is the idle time for a connection before keep-alive 71 // probes are sent. 72 DefaultKeepaliveIdle = 2 * time.Hour 73 74 // DefaultKeepaliveInterval is the time between two successive keep-alive 75 // probes. 76 DefaultKeepaliveInterval = 75 * time.Second 77 78 // DefaultKeepaliveCount is the number of keep-alive probes that are sent 79 // before declaring the connection dead. 80 DefaultKeepaliveCount = 9 81 ) 82 83 const ( 84 ccReno = "reno" 85 ccCubic = "cubic" 86 ) 87 88 type protocol struct { 89 stack *stack.Stack 90 91 mu sync.RWMutex 92 sackEnabled bool 93 recovery tcpip.TCPRecovery 94 delayEnabled bool 95 alwaysUseSynCookies bool 96 sendBufferSize tcpip.TCPSendBufferSizeRangeOption 97 recvBufferSize tcpip.TCPReceiveBufferSizeRangeOption 98 congestionControl string 99 availableCongestionControl []string 100 moderateReceiveBuffer bool 101 lingerTimeout time.Duration 102 timeWaitTimeout time.Duration 103 timeWaitReuse tcpip.TCPTimeWaitReuseOption 104 minRTO time.Duration 105 maxRTO time.Duration 106 maxRetries uint32 107 synRetries uint8 108 dispatcher dispatcher 109 110 // The following secrets are initialized once and stay unchanged after. 111 seqnumSecret uint32 112 portOffsetSecret uint32 113 tsOffsetSecret uint32 114 } 115 116 // Number returns the tcp protocol number. 117 func (*protocol) Number() tcpip.TransportProtocolNumber { 118 return ProtocolNumber 119 } 120 121 // NewEndpoint creates a new tcp endpoint. 122 func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { 123 return newEndpoint(p.stack, p, netProto, waiterQueue), nil 124 } 125 126 // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently 127 // unsupported. It implements stack.TransportProtocol.NewRawEndpoint. 128 func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { 129 return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue) 130 } 131 132 // MinimumPacketSize returns the minimum valid tcp packet size. 133 func (*protocol) MinimumPacketSize() int { 134 return header.TCPMinimumSize 135 } 136 137 // ParsePorts returns the source and destination ports stored in the given tcp 138 // packet. 139 func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err tcpip.Error) { 140 h := header.TCP(v) 141 return h.SourcePort(), h.DestinationPort(), nil 142 } 143 144 // QueuePacket queues packets targeted at an endpoint after hashing the packet 145 // to a specific processing queue. Each queue is serviced by its own processor 146 // goroutine which is responsible for dequeuing and doing full TCP dispatch of 147 // the packet. 148 func (p *protocol) QueuePacket(ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { 149 p.dispatcher.queuePacket(ep, id, p.stack.Clock(), pkt) 150 } 151 152 // HandleUnknownDestinationPacket handles packets targeted at this protocol but 153 // that don't match any existing endpoint. 154 // 155 // RFC 793, page 36, states that "If the connection does not exist (CLOSED) then 156 // a reset is sent in response to any incoming segment except another reset. In 157 // particular, SYNs addressed to a non-existent connection are rejected by this 158 // means." 159 func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { 160 s := newIncomingSegment(id, p.stack.Clock(), pkt) 161 defer s.decRef() 162 163 if !s.parse(pkt.RXTransportChecksumValidated) || !s.csumValid { 164 return stack.UnknownDestinationPacketMalformed 165 } 166 167 if !s.flags.Contains(header.TCPFlagRst) { 168 replyWithReset(p.stack, s, stack.DefaultTOS, 0) 169 } 170 171 return stack.UnknownDestinationPacketHandled 172 } 173 174 func (p *protocol) tsOffset(src, dst tcpip.Address) tcp.TSOffset { 175 // Initialize a random tsOffset that will be added to the recentTS 176 // everytime the timestamp is sent when the Timestamp option is enabled. 177 // 178 // See https://tools.ietf.org/html/rfc7323#section-5.4 for details on 179 // why this is required. 180 // 181 // TODO(https://gvisor.dev/issues/6473): This is not really secure as 182 // it does not use the recommended algorithm linked above. 183 h := jenkins.Sum32(p.tsOffsetSecret) 184 // Per hash.Hash.Writer: 185 // 186 // It never returns an error. 187 _, _ = h.Write([]byte(src)) 188 _, _ = h.Write([]byte(dst)) 189 return tcp.NewTSOffset(h.Sum32()) 190 } 191 192 // replyWithReset replies to the given segment with a reset segment. 193 // 194 // If the passed TTL is 0, then the route's default TTL will be used. 195 func replyWithReset(st *stack.Stack, s *segment, tos, ttl uint8) tcpip.Error { 196 route, err := st.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */) 197 if err != nil { 198 return err 199 } 200 defer route.Release() 201 202 // Get the seqnum from the packet if the ack flag is set. 203 seq := seqnum.Value(0) 204 ack := seqnum.Value(0) 205 flags := header.TCPFlagRst 206 // As per RFC 793 page 35 (Reset Generation) 207 // 1. If the connection does not exist (CLOSED) then a reset is sent 208 // in response to any incoming segment except another reset. In 209 // particular, SYNs addressed to a non-existent connection are rejected 210 // by this means. 211 212 // If the incoming segment has an ACK field, the reset takes its 213 // sequence number from the ACK field of the segment, otherwise the 214 // reset has sequence number zero and the ACK field is set to the sum 215 // of the sequence number and segment length of the incoming segment. 216 // The connection remains in the CLOSED state. 217 if s.flags.Contains(header.TCPFlagAck) { 218 seq = s.ackNumber 219 } else { 220 flags |= header.TCPFlagAck 221 ack = s.sequenceNumber.Add(s.logicalLen()) 222 } 223 224 if ttl == 0 { 225 ttl = route.DefaultTTL() 226 } 227 228 return sendTCP(route, tcpFields{ 229 id: s.id, 230 ttl: ttl, 231 tos: tos, 232 flags: flags, 233 seq: seq, 234 ack: ack, 235 rcvWnd: 0, 236 }, buffer.VectorisedView{}, stack.GSO{}, nil /* PacketOwner */) 237 } 238 239 // SetOption implements stack.TransportProtocol.SetOption. 240 func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error { 241 switch v := option.(type) { 242 case *tcpip.TCPSACKEnabled: 243 p.mu.Lock() 244 p.sackEnabled = bool(*v) 245 p.mu.Unlock() 246 return nil 247 248 case *tcpip.TCPRecovery: 249 p.mu.Lock() 250 p.recovery = *v 251 p.mu.Unlock() 252 return nil 253 254 case *tcpip.TCPDelayEnabled: 255 p.mu.Lock() 256 p.delayEnabled = bool(*v) 257 p.mu.Unlock() 258 return nil 259 260 case *tcpip.TCPSendBufferSizeRangeOption: 261 if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { 262 return &tcpip.ErrInvalidOptionValue{} 263 } 264 p.mu.Lock() 265 p.sendBufferSize = *v 266 p.mu.Unlock() 267 return nil 268 269 case *tcpip.TCPReceiveBufferSizeRangeOption: 270 if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { 271 return &tcpip.ErrInvalidOptionValue{} 272 } 273 p.mu.Lock() 274 p.recvBufferSize = *v 275 p.mu.Unlock() 276 return nil 277 278 case *tcpip.CongestionControlOption: 279 for _, c := range p.availableCongestionControl { 280 if string(*v) == c { 281 p.mu.Lock() 282 p.congestionControl = string(*v) 283 p.mu.Unlock() 284 return nil 285 } 286 } 287 // linux returns ENOENT when an invalid congestion control 288 // is specified. 289 return &tcpip.ErrNoSuchFile{} 290 291 case *tcpip.TCPModerateReceiveBufferOption: 292 p.mu.Lock() 293 p.moderateReceiveBuffer = bool(*v) 294 p.mu.Unlock() 295 return nil 296 297 case *tcpip.TCPLingerTimeoutOption: 298 p.mu.Lock() 299 if *v < 0 { 300 p.lingerTimeout = 0 301 } else { 302 p.lingerTimeout = time.Duration(*v) 303 } 304 p.mu.Unlock() 305 return nil 306 307 case *tcpip.TCPTimeWaitTimeoutOption: 308 p.mu.Lock() 309 if *v < 0 { 310 p.timeWaitTimeout = 0 311 } else { 312 p.timeWaitTimeout = time.Duration(*v) 313 } 314 p.mu.Unlock() 315 return nil 316 317 case *tcpip.TCPTimeWaitReuseOption: 318 if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly { 319 return &tcpip.ErrInvalidOptionValue{} 320 } 321 p.mu.Lock() 322 p.timeWaitReuse = *v 323 p.mu.Unlock() 324 return nil 325 326 case *tcpip.TCPMinRTOOption: 327 p.mu.Lock() 328 defer p.mu.Unlock() 329 if *v < 0 { 330 p.minRTO = MinRTO 331 } else if minRTO := time.Duration(*v); minRTO <= p.maxRTO { 332 p.minRTO = minRTO 333 } else { 334 return &tcpip.ErrInvalidOptionValue{} 335 } 336 return nil 337 338 case *tcpip.TCPMaxRTOOption: 339 p.mu.Lock() 340 defer p.mu.Unlock() 341 if *v < 0 { 342 p.maxRTO = MaxRTO 343 } else if maxRTO := time.Duration(*v); maxRTO >= p.minRTO { 344 p.maxRTO = maxRTO 345 } else { 346 return &tcpip.ErrInvalidOptionValue{} 347 } 348 return nil 349 350 case *tcpip.TCPMaxRetriesOption: 351 p.mu.Lock() 352 p.maxRetries = uint32(*v) 353 p.mu.Unlock() 354 return nil 355 356 case *tcpip.TCPAlwaysUseSynCookies: 357 p.mu.Lock() 358 p.alwaysUseSynCookies = bool(*v) 359 p.mu.Unlock() 360 return nil 361 362 case *tcpip.TCPSynRetriesOption: 363 if *v < 1 || *v > 255 { 364 return &tcpip.ErrInvalidOptionValue{} 365 } 366 p.mu.Lock() 367 p.synRetries = uint8(*v) 368 p.mu.Unlock() 369 return nil 370 371 default: 372 return &tcpip.ErrUnknownProtocolOption{} 373 } 374 } 375 376 // Option implements stack.TransportProtocol.Option. 377 func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Error { 378 switch v := option.(type) { 379 case *tcpip.TCPSACKEnabled: 380 p.mu.RLock() 381 *v = tcpip.TCPSACKEnabled(p.sackEnabled) 382 p.mu.RUnlock() 383 return nil 384 385 case *tcpip.TCPRecovery: 386 p.mu.RLock() 387 *v = p.recovery 388 p.mu.RUnlock() 389 return nil 390 391 case *tcpip.TCPDelayEnabled: 392 p.mu.RLock() 393 *v = tcpip.TCPDelayEnabled(p.delayEnabled) 394 p.mu.RUnlock() 395 return nil 396 397 case *tcpip.TCPSendBufferSizeRangeOption: 398 p.mu.RLock() 399 *v = p.sendBufferSize 400 p.mu.RUnlock() 401 return nil 402 403 case *tcpip.TCPReceiveBufferSizeRangeOption: 404 p.mu.RLock() 405 *v = p.recvBufferSize 406 p.mu.RUnlock() 407 return nil 408 409 case *tcpip.CongestionControlOption: 410 p.mu.RLock() 411 *v = tcpip.CongestionControlOption(p.congestionControl) 412 p.mu.RUnlock() 413 return nil 414 415 case *tcpip.TCPAvailableCongestionControlOption: 416 p.mu.RLock() 417 *v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " ")) 418 p.mu.RUnlock() 419 return nil 420 421 case *tcpip.TCPModerateReceiveBufferOption: 422 p.mu.RLock() 423 *v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer) 424 p.mu.RUnlock() 425 return nil 426 427 case *tcpip.TCPLingerTimeoutOption: 428 p.mu.RLock() 429 *v = tcpip.TCPLingerTimeoutOption(p.lingerTimeout) 430 p.mu.RUnlock() 431 return nil 432 433 case *tcpip.TCPTimeWaitTimeoutOption: 434 p.mu.RLock() 435 *v = tcpip.TCPTimeWaitTimeoutOption(p.timeWaitTimeout) 436 p.mu.RUnlock() 437 return nil 438 439 case *tcpip.TCPTimeWaitReuseOption: 440 p.mu.RLock() 441 *v = p.timeWaitReuse 442 p.mu.RUnlock() 443 return nil 444 445 case *tcpip.TCPMinRTOOption: 446 p.mu.RLock() 447 *v = tcpip.TCPMinRTOOption(p.minRTO) 448 p.mu.RUnlock() 449 return nil 450 451 case *tcpip.TCPMaxRTOOption: 452 p.mu.RLock() 453 *v = tcpip.TCPMaxRTOOption(p.maxRTO) 454 p.mu.RUnlock() 455 return nil 456 457 case *tcpip.TCPMaxRetriesOption: 458 p.mu.RLock() 459 *v = tcpip.TCPMaxRetriesOption(p.maxRetries) 460 p.mu.RUnlock() 461 return nil 462 463 case *tcpip.TCPAlwaysUseSynCookies: 464 p.mu.RLock() 465 *v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies) 466 p.mu.RUnlock() 467 return nil 468 469 case *tcpip.TCPSynRetriesOption: 470 p.mu.RLock() 471 *v = tcpip.TCPSynRetriesOption(p.synRetries) 472 p.mu.RUnlock() 473 return nil 474 475 default: 476 return &tcpip.ErrUnknownProtocolOption{} 477 } 478 } 479 480 // Close implements stack.TransportProtocol.Close. 481 func (p *protocol) Close() { 482 p.dispatcher.close() 483 } 484 485 // Wait implements stack.TransportProtocol.Wait. 486 func (p *protocol) Wait() { 487 p.dispatcher.wait() 488 } 489 490 // Parse implements stack.TransportProtocol.Parse. 491 func (*protocol) Parse(pkt *stack.PacketBuffer) bool { 492 return parse.TCP(pkt) 493 } 494 495 // NewProtocol returns a TCP transport protocol. 496 func NewProtocol(s *stack.Stack) stack.TransportProtocol { 497 p := protocol{ 498 stack: s, 499 sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{ 500 Min: MinBufferSize, 501 Default: DefaultSendBufferSize, 502 Max: MaxBufferSize, 503 }, 504 recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{ 505 Min: MinBufferSize, 506 Default: DefaultReceiveBufferSize, 507 Max: MaxBufferSize, 508 }, 509 congestionControl: ccReno, 510 availableCongestionControl: []string{ccReno, ccCubic}, 511 lingerTimeout: DefaultTCPLingerTimeout, 512 timeWaitTimeout: DefaultTCPTimeWaitTimeout, 513 timeWaitReuse: tcpip.TCPTimeWaitReuseLoopbackOnly, 514 synRetries: DefaultSynRetries, 515 minRTO: MinRTO, 516 maxRTO: MaxRTO, 517 maxRetries: MaxRetries, 518 recovery: tcpip.TCPRACKLossDetection, 519 seqnumSecret: s.Rand().Uint32(), 520 portOffsetSecret: s.Rand().Uint32(), 521 tsOffsetSecret: s.Rand().Uint32(), 522 } 523 p.dispatcher.init(s.Rand(), runtime.GOMAXPROCS(0)) 524 return &p 525 } 526 527 // protocolFromStack retrieves the tcp.protocol instance from stack s. 528 func protocolFromStack(s *stack.Stack) *protocol { 529 return s.TransportProtocolInstance(ProtocolNumber).(*protocol) 530 }