// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package stack provides the glue between networking protocols and the
// consumers of the networking stack.
//
// For consumers, the only function of interest is New(), everything else is
// provided by the tcpip/public package.
package stack

import (
	"encoding/binary"
	"sync"
	"time"

	"github.com/FlowerWrong/netstack/rand"
	"github.com/FlowerWrong/netstack/sleep"
	"github.com/FlowerWrong/netstack/tcpip"
	"github.com/FlowerWrong/netstack/tcpip/buffer"
	"github.com/FlowerWrong/netstack/tcpip/header"
	"github.com/FlowerWrong/netstack/tcpip/iptables"
	"github.com/FlowerWrong/netstack/tcpip/ports"
	"github.com/FlowerWrong/netstack/tcpip/seqnum"
	"github.com/FlowerWrong/netstack/waiter"
	"golang.org/x/time/rate"
)

const (
	// ageLimit is set to the same cache stale time used in Linux.
	ageLimit = 1 * time.Minute
	// resolutionTimeout is set to the same ARP timeout used in Linux.
	resolutionTimeout = 1 * time.Second
	// resolutionAttempts is set to the same ARP retries used in Linux.
	resolutionAttempts = 3
)

// transportProtocolState pairs a registered transport protocol with the
// optional per-stack default handler installed through
// Stack.SetTransportProtocolHandler.
type transportProtocolState struct {
	proto          TransportProtocol
	defaultHandler func(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool
}

// TCPProbeFunc is the expected function type for a TCP probe function to be
// passed to stack.AddTCPProbe.
type TCPProbeFunc func(s TCPEndpointState)

// TCPCubicState is used to hold a copy of the internal cubic state when the
// TCPProbeFunc is invoked.
type TCPCubicState struct {
	WLastMax                float64
	WMax                    float64
	T                       time.Time
	TimeSinceLastCongestion time.Duration
	C                       float64
	K                       float64
	Beta                    float64
	WC                      float64
	WEst                    float64
}

// TCPEndpointID is the unique 4 tuple that identifies a given endpoint.
type TCPEndpointID struct {
	// LocalPort is the local port associated with the endpoint.
	LocalPort uint16

	// LocalAddress is the local [network layer] address associated with
	// the endpoint.
	LocalAddress tcpip.Address

	// RemotePort is the remote port associated with the endpoint.
	RemotePort uint16

	// RemoteAddress is the remote [network layer] address associated with
	// the endpoint.
	RemoteAddress tcpip.Address
}

// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
// TCP endpoint.
type TCPFastRecoveryState struct {
	// Active if true indicates the endpoint is in fast recovery.
	Active bool

	// First is the first unacknowledged sequence number being recovered.
	First seqnum.Value

	// Last is the 'recover' sequence number that indicates the point at
	// which we should exit recovery barring any timeouts etc.
	Last seqnum.Value

	// MaxCwnd is the maximum value we are permitted to grow the congestion
	// window during recovery. This is set at the time we enter recovery.
	MaxCwnd int

	// HighRxt is the highest sequence number which has been retransmitted
	// during the current loss recovery phase.
	// See: RFC 6675 Section 2 for details.
	HighRxt seqnum.Value

	// RescueRxt is the highest sequence number which has been
	// optimistically retransmitted to prevent stalling of the ACK clock
	// when there is loss at the end of the window and no new data is
	// available for transmission.
	// See: RFC 6675 Section 2 for details.
	RescueRxt seqnum.Value
}

// TCPReceiverState holds a copy of the internal state of the receiver for
// a given TCP endpoint.
type TCPReceiverState struct {
	// RcvNxt is the TCP variable RCV.NXT.
	RcvNxt seqnum.Value

	// RcvAcc is the TCP variable RCV.ACC.
	RcvAcc seqnum.Value

	// RcvWndScale is the window scaling to use for inbound segments.
	RcvWndScale uint8

	// PendingBufUsed is the number of bytes pending in the receive
	// queue.
	PendingBufUsed seqnum.Size

	// PendingBufSize is the size of the socket receive buffer.
	PendingBufSize seqnum.Size
}

// TCPSenderState holds a copy of the internal state of the sender for
// a given TCP Endpoint.
type TCPSenderState struct {
	// LastSendTime is the time at which we sent the last segment.
	LastSendTime time.Time

	// DupAckCount is the number of duplicate ACKs received.
	DupAckCount int

	// SndCwnd is the size of the sending congestion window in packets.
	SndCwnd int

	// Ssthresh is the slow start threshold in packets.
	Ssthresh int

	// SndCAAckCount is the number of packets consumed in congestion
	// avoidance mode.
	SndCAAckCount int

	// Outstanding is the number of packets in flight.
	Outstanding int

	// SndWnd is the send window size in bytes.
	SndWnd seqnum.Size

	// SndUna is the next unacknowledged sequence number.
	SndUna seqnum.Value

	// SndNxt is the sequence number of the next segment to be sent.
	SndNxt seqnum.Value

	// RTTMeasureSeqNum is the sequence number being used for the latest RTT
	// measurement.
	RTTMeasureSeqNum seqnum.Value

	// RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
	RTTMeasureTime time.Time

	// Closed indicates that the caller has closed the endpoint for sending.
	Closed bool

	// SRTT is the smoothed round-trip time as defined in section 2 of
	// RFC 6298.
	SRTT time.Duration

	// RTO is the retransmit timeout as defined in section 2 of RFC 6298.
	RTO time.Duration

	// RTTVar is the round-trip time variation as defined in section 2 of
	// RFC 6298.
	RTTVar time.Duration

	// SRTTInited if true indicates that a valid RTT measurement has been
	// completed.
	SRTTInited bool

	// MaxPayloadSize is the maximum size of the payload of a given segment.
	// It is initialized on demand.
	MaxPayloadSize int

	// SndWndScale is the number of bits to shift left when reading the send
	// window size from a segment.
	SndWndScale uint8

	// MaxSentAck is the highest acknowledgement number sent till now.
	MaxSentAck seqnum.Value

	// FastRecovery holds the fast recovery state for the endpoint.
	FastRecovery TCPFastRecoveryState

	// Cubic holds the state related to CUBIC congestion control.
	Cubic TCPCubicState
}

// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
type TCPSACKInfo struct {
	// Blocks is the list of SACK Blocks that identify the out of order segments
	// held by a given TCP endpoint.
	Blocks []header.SACKBlock

	// ReceivedBlocks are the SACK blocks received by this endpoint
	// from the peer endpoint.
	ReceivedBlocks []header.SACKBlock

	// MaxSACKED is the highest sequence number that has been SACKED
	// by the peer.
	MaxSACKED seqnum.Value
}

// RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
type RcvBufAutoTuneParams struct {
	// MeasureTime is the time at which the current measurement
	// was started.
	MeasureTime time.Time

	// CopiedBytes is the number of bytes copied to user space since
	// this measure began.
	CopiedBytes int

	// PrevCopiedBytes is the number of bytes copied to user space in
	// the previous RTT period.
	PrevCopiedBytes int

	// RcvBufSize is the auto tuned receive buffer size.
	RcvBufSize int

	// RTT is the smoothed RTT as measured by observing the time between
	// when a byte is first acknowledged and the receipt of data that is at
	// least one window beyond the sequence number that was acknowledged.
	RTT time.Duration

	// RTTVar is the "round-trip time variation" as defined in section 2
	// of RFC6298.
	RTTVar time.Duration

	// RTTMeasureSeqNumber is the highest acceptable sequence number at the
	// time this RTT measurement period began.
	RTTMeasureSeqNumber seqnum.Value

	// RTTMeasureTime is the absolute time at which the current RTT
	// measurement period began.
	RTTMeasureTime time.Time

	// Disabled is true if an explicit receive buffer is set for the
	// endpoint.
	Disabled bool
}

// TCPEndpointState is a copy of the internal state of a TCP endpoint.
type TCPEndpointState struct {
	// ID is a copy of the TransportEndpointID for the endpoint.
	ID TCPEndpointID

	// SegTime denotes the absolute time when this segment was received.
	SegTime time.Time

	// RcvBufSize is the size of the receive socket buffer for the endpoint.
	RcvBufSize int

	// RcvBufUsed is the amount of bytes actually held in the receive socket
	// buffer for the endpoint.
	RcvBufUsed int

	// RcvAutoParams is used to hold state variables to compute
	// the auto tuned receive buffer size.
	RcvAutoParams RcvBufAutoTuneParams

	// RcvClosed if true, indicates the endpoint has been closed for reading.
	RcvClosed bool

	// SendTSOk is used to indicate when the TS Option has been negotiated.
	// When SendTSOk is true every non-RST segment should carry a TS as per
	// RFC7323#section-1.1.
	SendTSOk bool

	// RecentTS is the timestamp that should be sent in the TSEcr field of
	// the timestamp for future segments sent by the endpoint. This field is
	// updated if required when a new segment is received by this endpoint.
	RecentTS uint32

	// TSOffset is a randomized offset added to the value of the TSVal field
	// in the timestamp option.
	TSOffset uint32

	// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
	// option in the SYN/SYN-ACK.
	SACKPermitted bool

	// SACK holds TCP SACK related information for this endpoint.
	SACK TCPSACKInfo

	// SndBufSize is the size of the socket send buffer.
	SndBufSize int

	// SndBufUsed is the number of bytes held in the socket send buffer.
	SndBufUsed int

	// SndClosed indicates that the endpoint has been closed for sends.
	SndClosed bool

	// SndBufInQueue is the number of bytes in the send queue.
	SndBufInQueue seqnum.Size

	// PacketTooBigCount is used to notify the main protocol routine how
	// many times a "packet too big" control packet is received.
	PacketTooBigCount int

	// SndMTU is the smallest MTU seen in the control packets received.
	SndMTU int

	// Receiver holds variables related to the TCP receiver for the endpoint.
	Receiver TCPReceiverState

	// Sender holds state related to the TCP Sender for the endpoint.
	Sender TCPSenderState
}

// ResumableEndpoint is an endpoint that needs to be resumed after restore.
type ResumableEndpoint interface {
	// Resume resumes an endpoint after restore. This can be used to restart
	// background workers such as protocol goroutines. This must be called after
	// all indirect dependencies of the endpoint have been restored, which
	// generally implies at the end of the restore process.
	Resume(*Stack)
}

// Stack is a networking stack, with all supported protocols, NICs, and route
// table.
type Stack struct {
	transportProtocols map[tcpip.TransportProtocolNumber]*transportProtocolState
	networkProtocols   map[tcpip.NetworkProtocolNumber]NetworkProtocol
	linkAddrResolvers  map[tcpip.NetworkProtocolNumber]LinkAddressResolver

	// unassociatedFactory creates unassociated endpoints. If nil, raw
	// endpoints are disabled. It is set during Stack creation and is
	// immutable.
	unassociatedFactory UnassociatedEndpointFactory

	demux *transportDemuxer

	stats tcpip.Stats

	linkAddrCache *linkAddrCache

	// mu is held by the methods in this file while they access nics,
	// forwarding and routeTable.
	mu         sync.RWMutex
	nics       map[tcpip.NICID]*NIC
	forwarding bool

	// routeTable is the route table passed in by the user via
	// SetRouteTable(), it is used by FindRoute() to build a route for a
	// specific destination.
	routeTable []tcpip.Route

	*ports.PortManager

	// If not nil, then any new endpoints will have this probe function
	// invoked every time they receive a TCP segment.
	tcpProbeFunc TCPProbeFunc

	// clock is used to generate user-visible times.
	clock tcpip.Clock

	// handleLocal allows non-loopback interfaces to loop packets.
	handleLocal bool

	// tables are the iptables packet filtering and manipulation rules.
	tables iptables.IPTables

	// resumableEndpoints is a list of endpoints that need to be resumed if the
	// stack is being restored.
	resumableEndpoints []ResumableEndpoint

	// icmpRateLimiter is a global rate limiter for all ICMP messages generated
	// by the stack.
	icmpRateLimiter *ICMPRateLimiter

	// portSeed is a one-time random value initialized at stack startup
	// and is used to seed the TCP port picking on active connections.
	//
	// TODO(gvisor.dev/issues/940): S/R this field.
	portSeed uint32
}

// Options contains optional Stack configuration.
type Options struct {
	// NetworkProtocols lists the network protocols to enable.
	NetworkProtocols []NetworkProtocol

	// TransportProtocols lists the transport protocols to enable.
	TransportProtocols []TransportProtocol

	// Clock is an optional clock source used for timestamping packets.
	//
	// If no Clock is specified, the clock source will be time.Now.
	Clock tcpip.Clock

	// Stats are optional statistic counters.
	Stats tcpip.Stats

	// HandleLocal indicates whether packets destined to their source
	// should be handled by the stack internally (true) or outside the
	// stack (false).
	HandleLocal bool

	// UnassociatedFactory produces unassociated raw endpoints.
	// Raw endpoints are enabled only if this is non-nil.
	UnassociatedFactory UnassociatedEndpointFactory
}

// New allocates a new networking stack with only the requested networking and
// transport protocols configured with default options.
//
// Protocol options can be changed by calling the
// SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the
// stack. Please refer to individual protocol implementations as to what options
// are supported.
434 func New(opts Options) *Stack { 435 clock := opts.Clock 436 if clock == nil { 437 clock = &tcpip.StdClock{} 438 } 439 440 s := &Stack{ 441 transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), 442 networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), 443 linkAddrResolvers: make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver), 444 nics: make(map[tcpip.NICID]*NIC), 445 linkAddrCache: newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts), 446 PortManager: ports.NewPortManager(), 447 clock: clock, 448 stats: opts.Stats.FillIn(), 449 handleLocal: opts.HandleLocal, 450 icmpRateLimiter: NewICMPRateLimiter(), 451 portSeed: generateRandUint32(), 452 } 453 454 // Add specified network protocols. 455 for _, netProto := range opts.NetworkProtocols { 456 s.networkProtocols[netProto.Number()] = netProto 457 if r, ok := netProto.(LinkAddressResolver); ok { 458 s.linkAddrResolvers[r.LinkAddressProtocol()] = r 459 } 460 } 461 462 // Add specified transport protocols. 463 for _, transProto := range opts.TransportProtocols { 464 s.transportProtocols[transProto.Number()] = &transportProtocolState{ 465 proto: transProto, 466 } 467 } 468 469 // Add the factory for unassociated endpoints, if present. 470 s.unassociatedFactory = opts.UnassociatedFactory 471 472 // Create the global transport demuxer. 473 s.demux = newTransportDemuxer(s) 474 475 return s 476 } 477 478 // SetNetworkProtocolOption allows configuring individual protocol level 479 // options. This method returns an error if the protocol is not supported or 480 // option is not supported by the protocol implementation or the provided value 481 // is incorrect. 
482 func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error { 483 netProto, ok := s.networkProtocols[network] 484 if !ok { 485 return tcpip.ErrUnknownProtocol 486 } 487 return netProto.SetOption(option) 488 } 489 490 // NetworkProtocolOption allows retrieving individual protocol level option 491 // values. This method returns an error if the protocol is not supported or 492 // option is not supported by the protocol implementation. 493 // e.g. 494 // var v ipv4.MyOption 495 // err := s.NetworkProtocolOption(tcpip.IPv4ProtocolNumber, &v) 496 // if err != nil { 497 // ... 498 // } 499 func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error { 500 netProto, ok := s.networkProtocols[network] 501 if !ok { 502 return tcpip.ErrUnknownProtocol 503 } 504 return netProto.Option(option) 505 } 506 507 // SetTransportProtocolOption allows configuring individual protocol level 508 // options. This method returns an error if the protocol is not supported or 509 // option is not supported by the protocol implementation or the provided value 510 // is incorrect. 511 func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error { 512 transProtoState, ok := s.transportProtocols[transport] 513 if !ok { 514 return tcpip.ErrUnknownProtocol 515 } 516 return transProtoState.proto.SetOption(option) 517 } 518 519 // TransportProtocolOption allows retrieving individual protocol level option 520 // values. This method returns an error if the protocol is not supported or 521 // option is not supported by the protocol implementation. 522 // var v tcp.SACKEnabled 523 // if err := s.TransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil { 524 // ... 
525 // } 526 func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error { 527 transProtoState, ok := s.transportProtocols[transport] 528 if !ok { 529 return tcpip.ErrUnknownProtocol 530 } 531 return transProtoState.proto.Option(option) 532 } 533 534 // SetTransportProtocolHandler sets the per-stack default handler for the given 535 // protocol. 536 // 537 // It must be called only during initialization of the stack. Changing it as the 538 // stack is operating is not supported. 539 func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, buffer.View, buffer.VectorisedView) bool) { 540 state := s.transportProtocols[p] 541 if state != nil { 542 state.defaultHandler = h 543 } 544 } 545 546 // NowNanoseconds implements tcpip.Clock.NowNanoseconds. 547 func (s *Stack) NowNanoseconds() int64 { 548 return s.clock.NowNanoseconds() 549 } 550 551 // Stats returns a mutable copy of the current stats. 552 // 553 // This is not generally exported via the public interface, but is available 554 // internally. 555 func (s *Stack) Stats() tcpip.Stats { 556 return s.stats 557 } 558 559 // SetForwarding enables or disables the packet forwarding between NICs. 560 func (s *Stack) SetForwarding(enable bool) { 561 // TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward. 562 s.mu.Lock() 563 s.forwarding = enable 564 s.mu.Unlock() 565 } 566 567 // Forwarding returns if the packet forwarding between NICs is enabled. 568 func (s *Stack) Forwarding() bool { 569 // TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward. 570 s.mu.RLock() 571 defer s.mu.RUnlock() 572 return s.forwarding 573 } 574 575 // SetRouteTable assigns the route table to be used by this stack. It 576 // specifies which NIC to use for given destination address ranges. 
577 func (s *Stack) SetRouteTable(table []tcpip.Route) { 578 s.mu.Lock() 579 defer s.mu.Unlock() 580 581 s.routeTable = table 582 } 583 584 // GetRouteTable returns the route table which is currently in use. 585 func (s *Stack) GetRouteTable() []tcpip.Route { 586 s.mu.Lock() 587 defer s.mu.Unlock() 588 return append([]tcpip.Route(nil), s.routeTable...) 589 } 590 591 // NewEndpoint creates a new transport layer endpoint of the given protocol. 592 func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { 593 t, ok := s.transportProtocols[transport] 594 if !ok { 595 return nil, tcpip.ErrUnknownProtocol 596 } 597 598 return t.proto.NewEndpoint(s, network, waiterQueue) 599 } 600 601 // NewRawEndpoint creates a new raw transport layer endpoint of the given 602 // protocol. Raw endpoints receive all traffic for a given protocol regardless 603 // of address. 604 func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) { 605 if s.unassociatedFactory == nil { 606 return nil, tcpip.ErrNotPermitted 607 } 608 609 if !associated { 610 return s.unassociatedFactory.NewUnassociatedRawEndpoint(s, network, transport, waiterQueue) 611 } 612 613 t, ok := s.transportProtocols[transport] 614 if !ok { 615 return nil, tcpip.ErrUnknownProtocol 616 } 617 618 return t.proto.NewRawEndpoint(s, network, waiterQueue) 619 } 620 621 // createNIC creates a NIC with the provided id and link-layer endpoint, and 622 // optionally enable it. 623 func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, loopback bool) *tcpip.Error { 624 s.mu.Lock() 625 defer s.mu.Unlock() 626 627 // Make sure id is unique. 
628 if _, ok := s.nics[id]; ok { 629 return tcpip.ErrDuplicateNICID 630 } 631 632 n := newNIC(s, id, name, ep, loopback) 633 634 s.nics[id] = n 635 if enabled { 636 return n.enable() 637 } 638 639 return nil 640 } 641 642 // CreateNIC creates a NIC with the provided id and link-layer endpoint. 643 func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error { 644 return s.createNIC(id, "", ep, true, false) 645 } 646 647 // CreateNamedNIC creates a NIC with the provided id and link-layer endpoint, 648 // and a human-readable name. 649 func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error { 650 return s.createNIC(id, name, ep, true, false) 651 } 652 653 // CreateNamedLoopbackNIC creates a NIC with the provided id and link-layer 654 // endpoint, and a human-readable name. 655 func (s *Stack) CreateNamedLoopbackNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error { 656 return s.createNIC(id, name, ep, true, true) 657 } 658 659 // CreateDisabledNIC creates a NIC with the provided id and link-layer endpoint, 660 // but leave it disable. Stack.EnableNIC must be called before the link-layer 661 // endpoint starts delivering packets to it. 662 func (s *Stack) CreateDisabledNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error { 663 return s.createNIC(id, "", ep, false, false) 664 } 665 666 // CreateDisabledNamedNIC is a combination of CreateNamedNIC and 667 // CreateDisabledNIC. 668 func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error { 669 return s.createNIC(id, name, ep, false, false) 670 } 671 672 // EnableNIC enables the given NIC so that the link-layer endpoint can start 673 // delivering packets to it. 674 func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error { 675 s.mu.RLock() 676 defer s.mu.RUnlock() 677 678 nic := s.nics[id] 679 if nic == nil { 680 return tcpip.ErrUnknownNICID 681 } 682 683 return nic.enable() 684 } 685 686 // CheckNIC checks if a NIC is usable. 
687 func (s *Stack) CheckNIC(id tcpip.NICID) bool { 688 s.mu.RLock() 689 nic, ok := s.nics[id] 690 s.mu.RUnlock() 691 if ok { 692 return nic.linkEP.IsAttached() 693 } 694 return false 695 } 696 697 // NICSubnets returns a map of NICIDs to their associated subnets. 698 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet { 699 s.mu.RLock() 700 defer s.mu.RUnlock() 701 702 nics := map[tcpip.NICID][]tcpip.Subnet{} 703 704 for id, nic := range s.nics { 705 nics[id] = append(nics[id], nic.AddressRanges()...) 706 } 707 return nics 708 } 709 710 // NICInfo captures the name and addresses assigned to a NIC. 711 type NICInfo struct { 712 Name string 713 LinkAddress tcpip.LinkAddress 714 ProtocolAddresses []tcpip.ProtocolAddress 715 716 // Flags indicate the state of the NIC. 717 Flags NICStateFlags 718 719 // MTU is the maximum transmission unit. 720 MTU uint32 721 722 Stats NICStats 723 } 724 725 // NICInfo returns a map of NICIDs to their associated information. 726 func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo { 727 s.mu.RLock() 728 defer s.mu.RUnlock() 729 730 nics := make(map[tcpip.NICID]NICInfo) 731 for id, nic := range s.nics { 732 flags := NICStateFlags{ 733 Up: true, // Netstack interfaces are always up. 734 Running: nic.linkEP.IsAttached(), 735 Promiscuous: nic.isPromiscuousMode(), 736 Loopback: nic.linkEP.Capabilities()&CapabilityLoopback != 0, 737 } 738 nics[id] = NICInfo{ 739 Name: nic.name, 740 LinkAddress: nic.linkEP.LinkAddress(), 741 ProtocolAddresses: nic.PrimaryAddresses(), 742 Flags: flags, 743 MTU: nic.linkEP.MTU(), 744 Stats: nic.stats, 745 } 746 } 747 return nics 748 } 749 750 // NICStateFlags holds information about the state of an NIC. 751 type NICStateFlags struct { 752 // Up indicates whether the interface is running. 753 Up bool 754 755 // Running indicates whether resources are allocated. 756 Running bool 757 758 // Promiscuous indicates whether the interface is in promiscuous mode. 
759 Promiscuous bool 760 761 // Loopback indicates whether the interface is a loopback. 762 Loopback bool 763 } 764 765 // AddAddress adds a new network-layer address to the specified NIC. 766 func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error { 767 return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint) 768 } 769 770 // AddProtocolAddress adds a new network-layer protocol address to the 771 // specified NIC. 772 func (s *Stack) AddProtocolAddress(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress) *tcpip.Error { 773 return s.AddProtocolAddressWithOptions(id, protocolAddress, CanBePrimaryEndpoint) 774 } 775 776 // AddAddressWithOptions is the same as AddAddress, but allows you to specify 777 // whether the new endpoint can be primary or not. 778 func (s *Stack) AddAddressWithOptions(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, peb PrimaryEndpointBehavior) *tcpip.Error { 779 netProto, ok := s.networkProtocols[protocol] 780 if !ok { 781 return tcpip.ErrUnknownProtocol 782 } 783 return s.AddProtocolAddressWithOptions(id, tcpip.ProtocolAddress{ 784 Protocol: protocol, 785 AddressWithPrefix: tcpip.AddressWithPrefix{ 786 Address: addr, 787 PrefixLen: netProto.DefaultPrefixLen(), 788 }, 789 }, peb) 790 } 791 792 // AddProtocolAddressWithOptions is the same as AddProtocolAddress, but allows 793 // you to specify whether the new endpoint can be primary or not. 794 func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error { 795 s.mu.RLock() 796 defer s.mu.RUnlock() 797 798 nic := s.nics[id] 799 if nic == nil { 800 return tcpip.ErrUnknownNICID 801 } 802 803 return nic.AddAddress(protocolAddress, peb) 804 } 805 806 // AddAddressRange adds a range of addresses to the specified NIC. 
The range is 807 // given by a subnet address, and all addresses contained in the subnet are 808 // used except for the subnet address itself and the subnet's broadcast 809 // address. 810 func (s *Stack) AddAddressRange(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) *tcpip.Error { 811 s.mu.RLock() 812 defer s.mu.RUnlock() 813 814 if nic, ok := s.nics[id]; ok { 815 nic.AddAddressRange(protocol, subnet) 816 return nil 817 } 818 819 return tcpip.ErrUnknownNICID 820 } 821 822 // RemoveAddressRange removes the range of addresses from the specified NIC. 823 func (s *Stack) RemoveAddressRange(id tcpip.NICID, subnet tcpip.Subnet) *tcpip.Error { 824 s.mu.RLock() 825 defer s.mu.RUnlock() 826 827 if nic, ok := s.nics[id]; ok { 828 nic.RemoveAddressRange(subnet) 829 return nil 830 } 831 832 return tcpip.ErrUnknownNICID 833 } 834 835 // RemoveAddress removes an existing network-layer address from the specified 836 // NIC. 837 func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) *tcpip.Error { 838 s.mu.RLock() 839 defer s.mu.RUnlock() 840 841 if nic, ok := s.nics[id]; ok { 842 return nic.RemoveAddress(addr) 843 } 844 845 return tcpip.ErrUnknownNICID 846 } 847 848 // AllAddresses returns a map of NICIDs to their protocol addresses (primary 849 // and non-primary). 850 func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress { 851 s.mu.RLock() 852 defer s.mu.RUnlock() 853 854 nics := make(map[tcpip.NICID][]tcpip.ProtocolAddress) 855 for id, nic := range s.nics { 856 nics[id] = nic.AllAddresses() 857 } 858 return nics 859 } 860 861 // GetMainNICAddress returns the first primary address and prefix for the given 862 // NIC and protocol. Returns an error if the NIC doesn't exist and an empty 863 // value if the NIC doesn't have a primary address for the given protocol. 
func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, *tcpip.Error) {
	s.mu.RLock()
	defer s.mu.RUnlock()

	var none tcpip.AddressWithPrefix

	nic, found := s.nics[id]
	if !found {
		return none, tcpip.ErrUnknownNICID
	}

	// The first primary address registered for the protocol wins.
	primaries := nic.PrimaryAddresses()
	for i := range primaries {
		if primaries[i].Protocol == protocol {
			return primaries[i].AddressWithPrefix, nil
		}
	}
	return none, nil
}

// getRefEP picks the referenced network endpoint to use on nic for netProto:
// the NIC's primary endpoint when no local address is given, otherwise the
// endpoint found for localAddr.
func (s *Stack) getRefEP(nic *NIC, localAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
	if len(localAddr) != 0 {
		return nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint)
	}
	return nic.primaryEndpoint(netProto)
}

// FindRoute creates a route to the given destination address, leaving through
// the given nic and local address (if provided).
//
// Broadcast, multicast and IPv6 link-local destinations do not need a routing
// table entry: when a NIC is specified for such a destination the route is
// built directly on that NIC. In every other case the routing table is walked
// in order and the first usable entry is taken. tcpip.ErrNetworkUnreachable is
// returned when no route is found for a destination that does not need a
// routing table entry, tcpip.ErrNoRoute otherwise.
func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (Route, *tcpip.Error) {
	s.mu.RLock()
	defer s.mu.RUnlock()

	needRoute := true
	switch {
	case remoteAddr == header.IPv4Broadcast,
		header.IsV4MulticastAddress(remoteAddr),
		header.IsV6MulticastAddress(remoteAddr),
		header.IsV6LinkLocalAddress(remoteAddr):
		needRoute = false
	}

	// build assembles the route once the NIC and endpoint have been chosen.
	build := func(nic *NIC, ref *referencedNetworkEndpoint, dst tcpip.Address) Route {
		return makeRoute(netProto, ref.ep.ID().LocalAddress, dst, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback)
	}

	if id != 0 && !needRoute {
		// The destination needs no routing table entry and the caller named
		// the NIC, so the routing table is bypassed entirely.
		if nic, ok := s.nics[id]; ok {
			if ref := s.getRefEP(nic, localAddr, netProto); ref != nil {
				return build(nic, ref, remoteAddr), nil
			}
		}
		return Route{}, tcpip.ErrNetworkUnreachable
	}

	for _, route := range s.routeTable {
		// Skip entries bound to a different NIC than the one requested.
		if id != 0 && route.NIC != id {
			continue
		}
		// Skip entries that do not cover the destination.
		if len(remoteAddr) != 0 && !route.Destination.Contains(remoteAddr) {
			continue
		}

		nic, ok := s.nics[route.NIC]
		if !ok {
			continue
		}
		ref := s.getRefEP(nic, localAddr, netProto)
		if ref == nil {
			continue
		}

		if len(remoteAddr) == 0 {
			// If no remote address was provided, then the route
			// provided will refer to the link local address.
			remoteAddr = ref.ep.ID().LocalAddress
		}

		r := build(nic, ref, remoteAddr)
		if needRoute {
			r.NextHop = route.Gateway
		}
		return r, nil
	}

	if needRoute {
		return Route{}, tcpip.ErrNoRoute
	}
	return Route{}, tcpip.ErrNetworkUnreachable
}

// CheckNetworkProtocol reports whether the given network protocol is enabled
// in the stack.
func (s *Stack) CheckNetworkProtocol(protocol tcpip.NetworkProtocolNumber) bool {
	_, enabled := s.networkProtocols[protocol]
	return enabled
}

// CheckLocalAddress determines if the given local address exists, and if it
// does, returns the id of the NIC it's bound to. Returns 0 if the address
// does not exist.
func (s *Stack) CheckLocalAddress(nicid tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID {
	s.mu.RLock()
	defer s.mu.RUnlock()

	// owns reports whether nic has an endpoint for addr. The reference taken
	// by the lookup is released again right away.
	owns := func(nic *NIC) bool {
		ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
		if ref == nil {
			return false
		}
		ref.decRef()
		return true
	}

	// If a NIC is specified, we try to find the address there only.
	if nicid != 0 {
		if nic := s.nics[nicid]; nic != nil && owns(nic) {
			return nic.id
		}
		return 0
	}

	// Go through all the NICs.
	for _, nic := range s.nics {
		if owns(nic) {
			return nic.id
		}
	}

	return 0
}

// SetPromiscuousMode enables or disables promiscuous mode in the given NIC.
977 func (s *Stack) SetPromiscuousMode(nicID tcpip.NICID, enable bool) *tcpip.Error { 978 s.mu.RLock() 979 defer s.mu.RUnlock() 980 981 nic := s.nics[nicID] 982 if nic == nil { 983 return tcpip.ErrUnknownNICID 984 } 985 986 nic.setPromiscuousMode(enable) 987 988 return nil 989 } 990 991 // SetSpoofing enables or disables address spoofing in the given NIC, allowing 992 // endpoints to bind to any address in the NIC. 993 func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) *tcpip.Error { 994 s.mu.RLock() 995 defer s.mu.RUnlock() 996 997 nic := s.nics[nicID] 998 if nic == nil { 999 return tcpip.ErrUnknownNICID 1000 } 1001 1002 nic.setSpoofing(enable) 1003 1004 return nil 1005 } 1006 1007 // AddLinkAddress adds a link address to the stack link cache. 1008 func (s *Stack) AddLinkAddress(nicid tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) { 1009 fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr} 1010 s.linkAddrCache.add(fullAddr, linkAddr) 1011 // TODO: provide a way for a transport endpoint to receive a signal 1012 // that AddLinkAddress for a particular address has been called. 1013 } 1014 1015 // GetLinkAddress implements LinkAddressCache.GetLinkAddress. 1016 func (s *Stack) GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { 1017 s.mu.RLock() 1018 nic := s.nics[nicid] 1019 if nic == nil { 1020 s.mu.RUnlock() 1021 return "", nil, tcpip.ErrUnknownNICID 1022 } 1023 s.mu.RUnlock() 1024 1025 fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr} 1026 linkRes := s.linkAddrResolvers[protocol] 1027 return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic.linkEP, waker) 1028 } 1029 1030 // RemoveWaker implements LinkAddressCache.RemoveWaker. 
1031 func (s *Stack) RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) { 1032 s.mu.RLock() 1033 defer s.mu.RUnlock() 1034 1035 if nic := s.nics[nicid]; nic == nil { 1036 fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr} 1037 s.linkAddrCache.removeWaker(fullAddr, waker) 1038 } 1039 } 1040 1041 // RegisterTransportEndpoint registers the given endpoint with the stack 1042 // transport dispatcher. Received packets that match the provided id will be 1043 // delivered to the given endpoint; specifying a nic is optional, but 1044 // nic-specific IDs have precedence over global ones. 1045 func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error { 1046 return s.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort, bindToDevice) 1047 } 1048 1049 // UnregisterTransportEndpoint removes the endpoint with the given id from the 1050 // stack transport dispatcher. 1051 func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) { 1052 s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice) 1053 } 1054 1055 // RegisterRawTransportEndpoint registers the given endpoint with the stack 1056 // transport dispatcher. Received packets that match the provided transport 1057 // protocol will be delivered to the given endpoint. 
1058 func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error { 1059 return s.demux.registerRawEndpoint(netProto, transProto, ep) 1060 } 1061 1062 // UnregisterRawTransportEndpoint removes the endpoint for the transport 1063 // protocol from the stack transport dispatcher. 1064 func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) { 1065 s.demux.unregisterRawEndpoint(netProto, transProto, ep) 1066 } 1067 1068 // RegisterRestoredEndpoint records e as an endpoint that has been restored on 1069 // this stack. 1070 func (s *Stack) RegisterRestoredEndpoint(e ResumableEndpoint) { 1071 s.mu.Lock() 1072 s.resumableEndpoints = append(s.resumableEndpoints, e) 1073 s.mu.Unlock() 1074 } 1075 1076 // Resume restarts the stack after a restore. This must be called after the 1077 // entire system has been restored. 1078 func (s *Stack) Resume() { 1079 // ResumableEndpoint.Resume() may call other methods on s, so we can't hold 1080 // s.mu while resuming the endpoints. 1081 s.mu.Lock() 1082 eps := s.resumableEndpoints 1083 s.resumableEndpoints = nil 1084 s.mu.Unlock() 1085 for _, e := range eps { 1086 e.Resume(s) 1087 } 1088 } 1089 1090 // NetworkProtocolInstance returns the protocol instance in the stack for the 1091 // specified network protocol. This method is public for protocol implementers 1092 // and tests to use. 1093 func (s *Stack) NetworkProtocolInstance(num tcpip.NetworkProtocolNumber) NetworkProtocol { 1094 if p, ok := s.networkProtocols[num]; ok { 1095 return p 1096 } 1097 return nil 1098 } 1099 1100 // TransportProtocolInstance returns the protocol instance in the stack for the 1101 // specified transport protocol. This method is public for protocol implementers 1102 // and tests to use. 
1103 func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) TransportProtocol { 1104 if pState, ok := s.transportProtocols[num]; ok { 1105 return pState.proto 1106 } 1107 return nil 1108 } 1109 1110 // AddTCPProbe installs a probe function that will be invoked on every segment 1111 // received by a given TCP endpoint. The probe function is passed a copy of the 1112 // TCP endpoint state before and after processing of the segment. 1113 // 1114 // NOTE: TCPProbe is added only to endpoints created after this call. Endpoints 1115 // created prior to this call will not call the probe function. 1116 // 1117 // Further, installing two different probes back to back can result in some 1118 // endpoints calling the first one and some the second one. There is no 1119 // guarantee provided on which probe will be invoked. Ideally this should only 1120 // be called once per stack. 1121 func (s *Stack) AddTCPProbe(probe TCPProbeFunc) { 1122 s.mu.Lock() 1123 s.tcpProbeFunc = probe 1124 s.mu.Unlock() 1125 } 1126 1127 // GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil 1128 // otherwise. 1129 func (s *Stack) GetTCPProbe() TCPProbeFunc { 1130 s.mu.Lock() 1131 p := s.tcpProbeFunc 1132 s.mu.Unlock() 1133 return p 1134 } 1135 1136 // RemoveTCPProbe removes an installed TCP probe. 1137 // 1138 // NOTE: This only ensures that endpoints created after this call do not 1139 // have a probe attached. Endpoints already created will continue to invoke 1140 // TCP probe. 1141 func (s *Stack) RemoveTCPProbe() { 1142 s.mu.Lock() 1143 s.tcpProbeFunc = nil 1144 s.mu.Unlock() 1145 } 1146 1147 // JoinGroup joins the given multicast group on the given NIC. 1148 func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error { 1149 // TODO: notify network of subscription via igmp protocol. 
1150 s.mu.RLock() 1151 defer s.mu.RUnlock() 1152 1153 if nic, ok := s.nics[nicID]; ok { 1154 return nic.joinGroup(protocol, multicastAddr) 1155 } 1156 return tcpip.ErrUnknownNICID 1157 } 1158 1159 // LeaveGroup leaves the given multicast group on the given NIC. 1160 func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error { 1161 s.mu.RLock() 1162 defer s.mu.RUnlock() 1163 1164 if nic, ok := s.nics[nicID]; ok { 1165 return nic.leaveGroup(multicastAddr) 1166 } 1167 return tcpip.ErrUnknownNICID 1168 } 1169 1170 // IPTables returns the stack's iptables. 1171 func (s *Stack) IPTables() iptables.IPTables { 1172 return s.tables 1173 } 1174 1175 // SetIPTables sets the stack's iptables. 1176 func (s *Stack) SetIPTables(ipt iptables.IPTables) { 1177 s.tables = ipt 1178 } 1179 1180 // ICMPLimit returns the maximum number of ICMP messages that can be sent 1181 // in one second. 1182 func (s *Stack) ICMPLimit() rate.Limit { 1183 return s.icmpRateLimiter.Limit() 1184 } 1185 1186 // SetICMPLimit sets the maximum number of ICMP messages that be sent 1187 // in one second. 1188 func (s *Stack) SetICMPLimit(newLimit rate.Limit) { 1189 s.icmpRateLimiter.SetLimit(newLimit) 1190 } 1191 1192 // ICMPBurst returns the maximum number of ICMP messages that can be sent 1193 // in a single burst. 1194 func (s *Stack) ICMPBurst() int { 1195 return s.icmpRateLimiter.Burst() 1196 } 1197 1198 // SetICMPBurst sets the maximum number of ICMP messages that can be sent 1199 // in a single burst. 1200 func (s *Stack) SetICMPBurst(burst int) { 1201 s.icmpRateLimiter.SetBurst(burst) 1202 } 1203 1204 // AllowICMPMessage returns true if we the rate limiter allows at least one 1205 // ICMP message to be sent at this instant. 1206 func (s *Stack) AllowICMPMessage() bool { 1207 return s.icmpRateLimiter.Allow() 1208 } 1209 1210 // PortSeed returns a 32 bit value that can be used as a seed value for port 1211 // picking. 
1212 // 1213 // NOTE: The seed is generated once during stack initialization only. 1214 func (s *Stack) PortSeed() uint32 { 1215 return s.portSeed 1216 } 1217 1218 func generateRandUint32() uint32 { 1219 b := make([]byte, 4) 1220 if _, err := rand.Read(b); err != nil { 1221 panic(err) 1222 } 1223 return binary.LittleEndian.Uint32(b) 1224 }