github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/tcpip/transport/tcp/protocol.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package tcp contains the implementation of the TCP transport protocol. 16 package tcp 17 18 import ( 19 "runtime" 20 "strings" 21 "time" 22 23 "github.com/SagerNet/gvisor/pkg/sync" 24 "github.com/SagerNet/gvisor/pkg/tcpip" 25 "github.com/SagerNet/gvisor/pkg/tcpip/buffer" 26 "github.com/SagerNet/gvisor/pkg/tcpip/header" 27 "github.com/SagerNet/gvisor/pkg/tcpip/header/parse" 28 "github.com/SagerNet/gvisor/pkg/tcpip/seqnum" 29 "github.com/SagerNet/gvisor/pkg/tcpip/stack" 30 "github.com/SagerNet/gvisor/pkg/tcpip/transport/raw" 31 "github.com/SagerNet/gvisor/pkg/waiter" 32 ) 33 34 const ( 35 // ProtocolNumber is the tcp protocol number. 36 ProtocolNumber = header.TCPProtocolNumber 37 38 // MinBufferSize is the smallest size of a receive or send buffer. 39 MinBufferSize = 4 << 10 // 4096 bytes. 40 41 // DefaultSendBufferSize is the default size of the send buffer for 42 // an endpoint. 43 DefaultSendBufferSize = 1 << 20 // 1MB 44 45 // DefaultReceiveBufferSize is the default size of the receive buffer 46 // for an endpoint. 47 DefaultReceiveBufferSize = 1 << 20 // 1MB 48 49 // MaxBufferSize is the largest size a receive/send buffer can grow to. 50 MaxBufferSize = 4 << 20 // 4MB 51 52 // MaxUnprocessedSegments is the maximum number of unprocessed segments 53 // that can be queued for a given endpoint. 54 MaxUnprocessedSegments = 300 55 56 // DefaultTCPLingerTimeout is the amount of time that sockets linger in 57 // FIN_WAIT_2 state before being marked closed. 58 DefaultTCPLingerTimeout = 60 * time.Second 59 60 // MaxTCPLingerTimeout is the maximum amount of time that sockets 61 // linger in FIN_WAIT_2 state before being marked closed. 62 MaxTCPLingerTimeout = 120 * time.Second 63 64 // DefaultTCPTimeWaitTimeout is the amount of time that sockets linger 65 // in TIME_WAIT state before being marked closed. 66 DefaultTCPTimeWaitTimeout = 60 * time.Second 67 68 // DefaultSynRetries is the default value for the number of SYN retransmits 69 // before a connect is aborted. 70 DefaultSynRetries = 6 71 ) 72 73 const ( 74 ccReno = "reno" 75 ccCubic = "cubic" 76 ) 77 78 type protocol struct { 79 stack *stack.Stack 80 81 mu sync.RWMutex 82 sackEnabled bool 83 recovery tcpip.TCPRecovery 84 delayEnabled bool 85 alwaysUseSynCookies bool 86 sendBufferSize tcpip.TCPSendBufferSizeRangeOption 87 recvBufferSize tcpip.TCPReceiveBufferSizeRangeOption 88 congestionControl string 89 availableCongestionControl []string 90 moderateReceiveBuffer bool 91 lingerTimeout time.Duration 92 timeWaitTimeout time.Duration 93 timeWaitReuse tcpip.TCPTimeWaitReuseOption 94 minRTO time.Duration 95 maxRTO time.Duration 96 maxRetries uint32 97 synRetries uint8 98 dispatcher dispatcher 99 } 100 101 // Number returns the tcp protocol number. 102 func (*protocol) Number() tcpip.TransportProtocolNumber { 103 return ProtocolNumber 104 } 105 106 // NewEndpoint creates a new tcp endpoint. 107 func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { 108 return newEndpoint(p.stack, netProto, waiterQueue), nil 109 } 110 111 // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently 112 // unsupported. It implements stack.TransportProtocol.NewRawEndpoint. 113 func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { 114 return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue) 115 } 116 117 // MinimumPacketSize returns the minimum valid tcp packet size. 118 func (*protocol) MinimumPacketSize() int { 119 return header.TCPMinimumSize 120 } 121 122 // ParsePorts returns the source and destination ports stored in the given tcp 123 // packet. 124 func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err tcpip.Error) { 125 h := header.TCP(v) 126 return h.SourcePort(), h.DestinationPort(), nil 127 } 128 129 // QueuePacket queues packets targeted at an endpoint after hashing the packet 130 // to a specific processing queue. Each queue is serviced by its own processor 131 // goroutine which is responsible for dequeuing and doing full TCP dispatch of 132 // the packet. 133 func (p *protocol) QueuePacket(ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { 134 p.dispatcher.queuePacket(ep, id, p.stack.Clock(), pkt) 135 } 136 137 // HandleUnknownDestinationPacket handles packets targeted at this protocol but 138 // that don't match any existing endpoint. 139 // 140 // RFC 793, page 36, states that "If the connection does not exist (CLOSED) then 141 // a reset is sent in response to any incoming segment except another reset. In 142 // particular, SYNs addressed to a non-existent connection are rejected by this 143 // means." 144 func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { 145 s := newIncomingSegment(id, p.stack.Clock(), pkt) 146 defer s.decRef() 147 148 if !s.parse(pkt.RXTransportChecksumValidated) || !s.csumValid { 149 return stack.UnknownDestinationPacketMalformed 150 } 151 152 if !s.flags.Contains(header.TCPFlagRst) { 153 replyWithReset(p.stack, s, stack.DefaultTOS, 0) 154 } 155 156 return stack.UnknownDestinationPacketHandled 157 } 158 159 // replyWithReset replies to the given segment with a reset segment. 160 // 161 // If the passed TTL is 0, then the route's default TTL will be used. 162 func replyWithReset(st *stack.Stack, s *segment, tos, ttl uint8) tcpip.Error { 163 route, err := st.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */) 164 if err != nil { 165 return err 166 } 167 defer route.Release() 168 169 // Get the seqnum from the packet if the ack flag is set. 170 seq := seqnum.Value(0) 171 ack := seqnum.Value(0) 172 flags := header.TCPFlagRst 173 // As per RFC 793 page 35 (Reset Generation) 174 // 1. If the connection does not exist (CLOSED) then a reset is sent 175 // in response to any incoming segment except another reset. In 176 // particular, SYNs addressed to a non-existent connection are rejected 177 // by this means. 178 179 // If the incoming segment has an ACK field, the reset takes its 180 // sequence number from the ACK field of the segment, otherwise the 181 // reset has sequence number zero and the ACK field is set to the sum 182 // of the sequence number and segment length of the incoming segment. 183 // The connection remains in the CLOSED state. 184 if s.flags.Contains(header.TCPFlagAck) { 185 seq = s.ackNumber 186 } else { 187 flags |= header.TCPFlagAck 188 ack = s.sequenceNumber.Add(s.logicalLen()) 189 } 190 191 if ttl == 0 { 192 ttl = route.DefaultTTL() 193 } 194 195 return sendTCP(route, tcpFields{ 196 id: s.id, 197 ttl: ttl, 198 tos: tos, 199 flags: flags, 200 seq: seq, 201 ack: ack, 202 rcvWnd: 0, 203 }, buffer.VectorisedView{}, stack.GSO{}, nil /* PacketOwner */) 204 } 205 206 // SetOption implements stack.TransportProtocol.SetOption. 207 func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error { 208 switch v := option.(type) { 209 case *tcpip.TCPSACKEnabled: 210 p.mu.Lock() 211 p.sackEnabled = bool(*v) 212 p.mu.Unlock() 213 return nil 214 215 case *tcpip.TCPRecovery: 216 p.mu.Lock() 217 p.recovery = *v 218 p.mu.Unlock() 219 return nil 220 221 case *tcpip.TCPDelayEnabled: 222 p.mu.Lock() 223 p.delayEnabled = bool(*v) 224 p.mu.Unlock() 225 return nil 226 227 case *tcpip.TCPSendBufferSizeRangeOption: 228 if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { 229 return &tcpip.ErrInvalidOptionValue{} 230 } 231 p.mu.Lock() 232 p.sendBufferSize = *v 233 p.mu.Unlock() 234 return nil 235 236 case *tcpip.TCPReceiveBufferSizeRangeOption: 237 if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { 238 return &tcpip.ErrInvalidOptionValue{} 239 } 240 p.mu.Lock() 241 p.recvBufferSize = *v 242 p.mu.Unlock() 243 return nil 244 245 case *tcpip.CongestionControlOption: 246 for _, c := range p.availableCongestionControl { 247 if string(*v) == c { 248 p.mu.Lock() 249 p.congestionControl = string(*v) 250 p.mu.Unlock() 251 return nil 252 } 253 } 254 // linux returns ENOENT when an invalid congestion control 255 // is specified. 256 return &tcpip.ErrNoSuchFile{} 257 258 case *tcpip.TCPModerateReceiveBufferOption: 259 p.mu.Lock() 260 p.moderateReceiveBuffer = bool(*v) 261 p.mu.Unlock() 262 return nil 263 264 case *tcpip.TCPLingerTimeoutOption: 265 p.mu.Lock() 266 if *v < 0 { 267 p.lingerTimeout = 0 268 } else { 269 p.lingerTimeout = time.Duration(*v) 270 } 271 p.mu.Unlock() 272 return nil 273 274 case *tcpip.TCPTimeWaitTimeoutOption: 275 p.mu.Lock() 276 if *v < 0 { 277 p.timeWaitTimeout = 0 278 } else { 279 p.timeWaitTimeout = time.Duration(*v) 280 } 281 p.mu.Unlock() 282 return nil 283 284 case *tcpip.TCPTimeWaitReuseOption: 285 if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly { 286 return &tcpip.ErrInvalidOptionValue{} 287 } 288 p.mu.Lock() 289 p.timeWaitReuse = *v 290 p.mu.Unlock() 291 return nil 292 293 case *tcpip.TCPMinRTOOption: 294 p.mu.Lock() 295 if *v < 0 { 296 p.minRTO = MinRTO 297 } else { 298 p.minRTO = time.Duration(*v) 299 } 300 p.mu.Unlock() 301 return nil 302 303 case *tcpip.TCPMaxRTOOption: 304 p.mu.Lock() 305 if *v < 0 { 306 p.maxRTO = MaxRTO 307 } else { 308 p.maxRTO = time.Duration(*v) 309 } 310 p.mu.Unlock() 311 return nil 312 313 case *tcpip.TCPMaxRetriesOption: 314 p.mu.Lock() 315 p.maxRetries = uint32(*v) 316 p.mu.Unlock() 317 return nil 318 319 case *tcpip.TCPAlwaysUseSynCookies: 320 p.mu.Lock() 321 p.alwaysUseSynCookies = bool(*v) 322 p.mu.Unlock() 323 return nil 324 325 case *tcpip.TCPSynRetriesOption: 326 if *v < 1 || *v > 255 { 327 return &tcpip.ErrInvalidOptionValue{} 328 } 329 p.mu.Lock() 330 p.synRetries = uint8(*v) 331 p.mu.Unlock() 332 return nil 333 334 default: 335 return &tcpip.ErrUnknownProtocolOption{} 336 } 337 } 338 339 // Option implements stack.TransportProtocol.Option. 340 func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Error { 341 switch v := option.(type) { 342 case *tcpip.TCPSACKEnabled: 343 p.mu.RLock() 344 *v = tcpip.TCPSACKEnabled(p.sackEnabled) 345 p.mu.RUnlock() 346 return nil 347 348 case *tcpip.TCPRecovery: 349 p.mu.RLock() 350 *v = p.recovery 351 p.mu.RUnlock() 352 return nil 353 354 case *tcpip.TCPDelayEnabled: 355 p.mu.RLock() 356 *v = tcpip.TCPDelayEnabled(p.delayEnabled) 357 p.mu.RUnlock() 358 return nil 359 360 case *tcpip.TCPSendBufferSizeRangeOption: 361 p.mu.RLock() 362 *v = p.sendBufferSize 363 p.mu.RUnlock() 364 return nil 365 366 case *tcpip.TCPReceiveBufferSizeRangeOption: 367 p.mu.RLock() 368 *v = p.recvBufferSize 369 p.mu.RUnlock() 370 return nil 371 372 case *tcpip.CongestionControlOption: 373 p.mu.RLock() 374 *v = tcpip.CongestionControlOption(p.congestionControl) 375 p.mu.RUnlock() 376 return nil 377 378 case *tcpip.TCPAvailableCongestionControlOption: 379 p.mu.RLock() 380 *v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " ")) 381 p.mu.RUnlock() 382 return nil 383 384 case *tcpip.TCPModerateReceiveBufferOption: 385 p.mu.RLock() 386 *v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer) 387 p.mu.RUnlock() 388 return nil 389 390 case *tcpip.TCPLingerTimeoutOption: 391 p.mu.RLock() 392 *v = tcpip.TCPLingerTimeoutOption(p.lingerTimeout) 393 p.mu.RUnlock() 394 return nil 395 396 case *tcpip.TCPTimeWaitTimeoutOption: 397 p.mu.RLock() 398 *v = tcpip.TCPTimeWaitTimeoutOption(p.timeWaitTimeout) 399 p.mu.RUnlock() 400 return nil 401 402 case *tcpip.TCPTimeWaitReuseOption: 403 p.mu.RLock() 404 *v = p.timeWaitReuse 405 p.mu.RUnlock() 406 return nil 407 408 case *tcpip.TCPMinRTOOption: 409 p.mu.RLock() 410 *v = tcpip.TCPMinRTOOption(p.minRTO) 411 p.mu.RUnlock() 412 return nil 413 414 case *tcpip.TCPMaxRTOOption: 415 p.mu.RLock() 416 *v = tcpip.TCPMaxRTOOption(p.maxRTO) 417 p.mu.RUnlock() 418 return nil 419 420 case *tcpip.TCPMaxRetriesOption: 421 p.mu.RLock() 422 *v = tcpip.TCPMaxRetriesOption(p.maxRetries) 423 p.mu.RUnlock() 424 return nil 425 426 case *tcpip.TCPAlwaysUseSynCookies: 427 p.mu.RLock() 428 *v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies) 429 p.mu.RUnlock() 430 return nil 431 432 case *tcpip.TCPSynRetriesOption: 433 p.mu.RLock() 434 *v = tcpip.TCPSynRetriesOption(p.synRetries) 435 p.mu.RUnlock() 436 return nil 437 438 default: 439 return &tcpip.ErrUnknownProtocolOption{} 440 } 441 } 442 443 // Close implements stack.TransportProtocol.Close. 444 func (p *protocol) Close() { 445 p.dispatcher.close() 446 } 447 448 // Wait implements stack.TransportProtocol.Wait. 449 func (p *protocol) Wait() { 450 p.dispatcher.wait() 451 } 452 453 // Parse implements stack.TransportProtocol.Parse. 454 func (*protocol) Parse(pkt *stack.PacketBuffer) bool { 455 return parse.TCP(pkt) 456 } 457 458 // NewProtocol returns a TCP transport protocol. 459 func NewProtocol(s *stack.Stack) stack.TransportProtocol { 460 p := protocol{ 461 stack: s, 462 sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{ 463 Min: MinBufferSize, 464 Default: DefaultSendBufferSize, 465 Max: MaxBufferSize, 466 }, 467 recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{ 468 Min: MinBufferSize, 469 Default: DefaultReceiveBufferSize, 470 Max: MaxBufferSize, 471 }, 472 congestionControl: ccReno, 473 availableCongestionControl: []string{ccReno, ccCubic}, 474 lingerTimeout: DefaultTCPLingerTimeout, 475 timeWaitTimeout: DefaultTCPTimeWaitTimeout, 476 timeWaitReuse: tcpip.TCPTimeWaitReuseLoopbackOnly, 477 synRetries: DefaultSynRetries, 478 minRTO: MinRTO, 479 maxRTO: MaxRTO, 480 maxRetries: MaxRetries, 481 // TODO(github.com/SagerNet/issue/5243): Set recovery to tcpip.TCPRACKLossDetection. 482 recovery: 0, 483 } 484 p.dispatcher.init(s.Rand(), runtime.GOMAXPROCS(0)) 485 return &p 486 }