// Source: github.com/google/netstack@v0.0.0-20191123085552-55fcc16cd0eb/tcpip/stack/stack.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package stack provides the glue between networking protocols and the
// consumers of the networking stack.
//
// For consumers, the only function of interest is New(), everything else is
// provided by the tcpip/public package.
package stack

import (
	"encoding/binary"
	"sync"
	"sync/atomic"
	"time"

	"github.com/google/netstack/rand"
	"github.com/google/netstack/sleep"
	"github.com/google/netstack/tcpip"
	"github.com/google/netstack/tcpip/buffer"
	"github.com/google/netstack/tcpip/header"
	"github.com/google/netstack/tcpip/iptables"
	"github.com/google/netstack/tcpip/ports"
	"github.com/google/netstack/tcpip/seqnum"
	"github.com/google/netstack/waiter"
	"golang.org/x/time/rate"
)

const (
	// ageLimit is set to the same cache stale time used in Linux.
	ageLimit = 1 * time.Minute
	// resolutionTimeout is set to the same ARP timeout used in Linux.
	resolutionTimeout = 1 * time.Second
	// resolutionAttempts is set to the same ARP retries used in Linux.
	resolutionAttempts = 3

	// DefaultTOS is the default type of service value for network endpoints.
	DefaultTOS = 0
)

// transportProtocolState pairs a registered transport protocol with the
// optional per-stack default handler installed through
// SetTransportProtocolHandler.
type transportProtocolState struct {
	proto          TransportProtocol
	defaultHandler func(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
}

// TCPProbeFunc is the expected function type for a TCP probe function to be
// passed to stack.AddTCPProbe.
type TCPProbeFunc func(s TCPEndpointState)

// TCPCubicState is used to hold a copy of the internal cubic state when the
// TCPProbeFunc is invoked.
type TCPCubicState struct {
	WLastMax                float64
	WMax                    float64
	T                       time.Time
	TimeSinceLastCongestion time.Duration
	C                       float64
	K                       float64
	Beta                    float64
	WC                      float64
	WEst                    float64
}

// TCPEndpointID is the unique 4 tuple that identifies a given endpoint.
type TCPEndpointID struct {
	// LocalPort is the local port associated with the endpoint.
	LocalPort uint16

	// LocalAddress is the local [network layer] address associated with
	// the endpoint.
	LocalAddress tcpip.Address

	// RemotePort is the remote port associated with the endpoint.
	RemotePort uint16

	// RemoteAddress is the remote [network layer] address associated with
	// the endpoint.
	RemoteAddress tcpip.Address
}

// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
// TCP endpoint.
type TCPFastRecoveryState struct {
	// Active if true indicates the endpoint is in fast recovery.
	Active bool

	// First is the first unacknowledged sequence number being recovered.
	First seqnum.Value

	// Last is the 'recover' sequence number that indicates the point at
	// which we should exit recovery barring any timeouts etc.
	Last seqnum.Value

	// MaxCwnd is the maximum value we are permitted to grow the congestion
	// window during recovery. This is set at the time we enter recovery.
	MaxCwnd int

	// HighRxt is the highest sequence number which has been retransmitted
	// during the current loss recovery phase.
	// See: RFC 6675 Section 2 for details.
	HighRxt seqnum.Value

	// RescueRxt is the highest sequence number which has been
	// optimistically retransmitted to prevent stalling of the ACK clock
	// when there is loss at the end of the window and no new data is
	// available for transmission.
	// See: RFC 6675 Section 2 for details.
	RescueRxt seqnum.Value
}

// TCPReceiverState holds a copy of the internal state of the receiver for
// a given TCP endpoint.
type TCPReceiverState struct {
	// RcvNxt is the TCP variable RCV.NXT.
	RcvNxt seqnum.Value

	// RcvAcc is the TCP variable RCV.ACC.
	RcvAcc seqnum.Value

	// RcvWndScale is the window scaling to use for inbound segments.
	RcvWndScale uint8

	// PendingBufUsed is the number of bytes pending in the receive
	// queue.
	PendingBufUsed seqnum.Size

	// PendingBufSize is the size of the socket receive buffer.
	PendingBufSize seqnum.Size
}

// TCPSenderState holds a copy of the internal state of the sender for
// a given TCP Endpoint.
type TCPSenderState struct {
	// LastSendTime is the time at which we sent the last segment.
	LastSendTime time.Time

	// DupAckCount is the number of Duplicate ACK's received.
	DupAckCount int

	// SndCwnd is the size of the sending congestion window in packets.
	SndCwnd int

	// Ssthresh is the slow start threshold in packets.
	Ssthresh int

	// SndCAAckCount is the number of packets consumed in congestion
	// avoidance mode.
	SndCAAckCount int

	// Outstanding is the number of packets in flight.
	Outstanding int

	// SndWnd is the send window size in bytes.
	SndWnd seqnum.Size

	// SndUna is the next unacknowledged sequence number.
	SndUna seqnum.Value

	// SndNxt is the sequence number of the next segment to be sent.
	SndNxt seqnum.Value

	// RTTMeasureSeqNum is the sequence number being used for the latest RTT
	// measurement.
	RTTMeasureSeqNum seqnum.Value

	// RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
	RTTMeasureTime time.Time

	// Closed indicates that the caller has closed the endpoint for sending.
	Closed bool

	// SRTT is the smoothed round-trip time as defined in section 2 of
	// RFC 6298.
	SRTT time.Duration

	// RTO is the retransmit timeout as defined in section 2 of RFC 6298.
	RTO time.Duration

	// RTTVar is the round-trip time variation as defined in section 2 of
	// RFC 6298.
	RTTVar time.Duration

	// SRTTInited if true indicates that a valid RTT measurement has been
	// completed.
	SRTTInited bool

	// MaxPayloadSize is the maximum size of the payload of a given segment.
	// It is initialized on demand.
	MaxPayloadSize int

	// SndWndScale is the number of bits to shift left when reading the send
	// window size from a segment.
	SndWndScale uint8

	// MaxSentAck is the highest acknowledgement number sent till now.
	MaxSentAck seqnum.Value

	// FastRecovery holds the fast recovery state for the endpoint.
	FastRecovery TCPFastRecoveryState

	// Cubic holds the state related to CUBIC congestion control.
	Cubic TCPCubicState
}

// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
type TCPSACKInfo struct {
	// Blocks is the list of SACK Blocks that identify the out of order segments
	// held by a given TCP endpoint.
	Blocks []header.SACKBlock

	// ReceivedBlocks are the SACK blocks received by this endpoint
	// from the peer endpoint.
	ReceivedBlocks []header.SACKBlock

	// MaxSACKED is the highest sequence number that has been SACKED
	// by the peer.
	MaxSACKED seqnum.Value
}

// RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
type RcvBufAutoTuneParams struct {
	// MeasureTime is the time at which the current measurement
	// was started.
	MeasureTime time.Time

	// CopiedBytes is the number of bytes copied to user space since
	// this measure began.
	CopiedBytes int

	// PrevCopiedBytes is the number of bytes copied to user space in
	// the previous RTT period.
	PrevCopiedBytes int

	// RcvBufSize is the auto tuned receive buffer size.
	RcvBufSize int

	// RTT is the smoothed RTT as measured by observing the time between
	// when a byte is first acknowledged and the receipt of data that is at
	// least one window beyond the sequence number that was acknowledged.
	RTT time.Duration

	// RTTVar is the "round-trip time variation" as defined in section 2
	// of RFC6298.
	RTTVar time.Duration

	// RTTMeasureSeqNumber is the highest acceptable sequence number at the
	// time this RTT measurement period began.
	RTTMeasureSeqNumber seqnum.Value

	// RTTMeasureTime is the absolute time at which the current RTT
	// measurement period began.
	RTTMeasureTime time.Time

	// Disabled is true if an explicit receive buffer is set for the
	// endpoint.
	Disabled bool
}

// TCPEndpointState is a copy of the internal state of a TCP endpoint.
type TCPEndpointState struct {
	// ID is a copy of the TransportEndpointID for the endpoint.
	ID TCPEndpointID

	// SegTime denotes the absolute time when this segment was received.
	SegTime time.Time

	// RcvBufSize is the size of the receive socket buffer for the endpoint.
	RcvBufSize int

	// RcvBufUsed is the amount of bytes actually held in the receive socket
	// buffer for the endpoint.
	RcvBufUsed int

	// RcvAutoParams is used to hold state variables to compute
	// the auto tuned receive buffer size.
	RcvAutoParams RcvBufAutoTuneParams

	// RcvClosed if true, indicates the endpoint has been closed for reading.
	RcvClosed bool

	// SendTSOk is used to indicate when the TS Option has been negotiated.
	// When SendTSOk is true every non-RST segment should carry a TS as per
	// RFC7323#section-1.1.
	SendTSOk bool

	// RecentTS is the timestamp that should be sent in the TSEcr field of
	// the timestamp for future segments sent by the endpoint. This field is
	// updated if required when a new segment is received by this endpoint.
	RecentTS uint32

	// TSOffset is a randomized offset added to the value of the TSVal field
	// in the timestamp option.
	TSOffset uint32

	// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
	// option in the SYN/SYN-ACK.
	SACKPermitted bool

	// SACK holds TCP SACK related information for this endpoint.
	SACK TCPSACKInfo

	// SndBufSize is the size of the socket send buffer.
	SndBufSize int

	// SndBufUsed is the number of bytes held in the socket send buffer.
	SndBufUsed int

	// SndClosed indicates that the endpoint has been closed for sends.
	SndClosed bool

	// SndBufInQueue is the number of bytes in the send queue.
	SndBufInQueue seqnum.Size

	// PacketTooBigCount is used to notify the main protocol routine how
	// many times a "packet too big" control packet is received.
	PacketTooBigCount int

	// SndMTU is the smallest MTU seen in the control packets received.
	SndMTU int

	// Receiver holds variables related to the TCP receiver for the endpoint.
	Receiver TCPReceiverState

	// Sender holds state related to the TCP Sender for the endpoint.
	Sender TCPSenderState
}

// ResumableEndpoint is an endpoint that needs to be resumed after restore.
type ResumableEndpoint interface {
	// Resume resumes an endpoint after restore. This can be used to restart
	// background workers such as protocol goroutines. This must be called after
	// all indirect dependencies of the endpoint have been restored, which
	// generally implies at the end of the restore process.
	Resume(*Stack)
}

// uniqueIDGenerator is a default unique ID generator.
type uniqueIDGenerator uint64

// UniqueID atomically increments the counter and returns the new value, so a
// zero-valued generator yields 1 on its first call.
func (u *uniqueIDGenerator) UniqueID() uint64 {
	return atomic.AddUint64((*uint64)(u), 1)
}

// Stack is a networking stack, with all supported protocols, NICs, and route
// table.
type Stack struct {
	transportProtocols map[tcpip.TransportProtocolNumber]*transportProtocolState
	networkProtocols   map[tcpip.NetworkProtocolNumber]NetworkProtocol
	linkAddrResolvers  map[tcpip.NetworkProtocolNumber]LinkAddressResolver

	// rawFactory creates raw endpoints. If nil, raw endpoints are
	// disabled. It is set during Stack creation and is immutable.
	rawFactory RawFactory

	demux *transportDemuxer

	stats tcpip.Stats

	linkAddrCache *linkAddrCache

	// mu is held by the methods in this file when they touch nics,
	// forwarding and routeTable; presumably it guards cleanupEndpoints as
	// well (TODO confirm against the rest of the package).
	mu               sync.RWMutex
	nics             map[tcpip.NICID]*NIC
	forwarding       bool
	cleanupEndpoints map[TransportEndpoint]struct{}

	// routeTable is the route table passed in by the user via
	// SetRouteTable(), it is used by FindRoute() to build a route for a
	// specific destination.
	routeTable []tcpip.Route

	*ports.PortManager

	// If not nil, then any new endpoints will have this probe function
	// invoked every time they receive a TCP segment.
	tcpProbeFunc TCPProbeFunc

	// clock is used to generate user-visible times.
	clock tcpip.Clock

	// handleLocal allows non-loopback interfaces to loop packets.
	handleLocal bool

	// tables are the iptables packet filtering and manipulation rules.
	tables iptables.IPTables

	// resumableEndpoints is a list of endpoints that need to be resumed if the
	// stack is being restored.
	resumableEndpoints []ResumableEndpoint

	// icmpRateLimiter is a global rate limiter for all ICMP messages generated
	// by the stack.
	icmpRateLimiter *ICMPRateLimiter

	// seed is a one-time random value initialized at stack startup
	// and is used to seed the TCP port picking on active connections.
	//
	// TODO(gvisor.dev/issue/940): S/R this field.
	seed uint32

	// ndpConfigs is the default NDP configurations used by interfaces.
	ndpConfigs NDPConfigurations

	// autoGenIPv6LinkLocal determines whether or not the stack will attempt
	// to auto-generate an IPv6 link-local address for newly enabled NICs.
	// See the AutoGenIPv6LinkLocal field of Options for more details.
	autoGenIPv6LinkLocal bool

	// ndpDisp is the NDP event dispatcher that is used to send the netstack
	// integrator NDP related events.
	ndpDisp NDPDispatcher

	// uniqueIDGenerator is a generator of unique identifiers.
	uniqueIDGenerator UniqueID
}

// UniqueID is an abstract generator of unique identifiers.
type UniqueID interface {
	UniqueID() uint64
}

// Options contains optional Stack configuration.
type Options struct {
	// NetworkProtocols lists the network protocols to enable.
	NetworkProtocols []NetworkProtocol

	// TransportProtocols lists the transport protocols to enable.
	TransportProtocols []TransportProtocol

	// Clock is an optional clock source used for timestamping packets.
	//
	// If no Clock is specified, New falls back to &tcpip.StdClock{}.
	Clock tcpip.Clock

	// Stats are optional statistic counters.
	Stats tcpip.Stats

	// HandleLocal indicates whether packets destined to their source
	// should be handled by the stack internally (true) or outside the
	// stack (false).
	HandleLocal bool

	// UniqueID is an optional generator of unique identifiers.
	UniqueID UniqueID

	// NDPConfigs is the default NDP configurations used by interfaces.
	//
	// By default, NDPConfigs will have a zero value for its
	// DupAddrDetectTransmits field, implying that DAD will not be performed
	// before assigning an address to a NIC.
	NDPConfigs NDPConfigurations

	// AutoGenIPv6LinkLocal determines whether or not the stack will attempt
	// to auto-generate an IPv6 link-local address for newly enabled NICs.
	// Note, setting this to true does not mean that a link-local address
	// will be assigned right away, or at all. If Duplicate Address
	// Detection is enabled, an address will only be assigned if it
	// successfully resolves. If it fails, no further attempt will be made
	// to auto-generate an IPv6 link-local address.
	//
	// The generated link-local address will follow RFC 4291 Appendix A
	// guidelines.
	AutoGenIPv6LinkLocal bool

	// NDPDisp is the NDP event dispatcher that an integrator can provide to
	// receive NDP related events.
	NDPDisp NDPDispatcher

	// RawFactory produces raw endpoints. Raw endpoints are enabled only if
	// this is non-nil.
	RawFactory RawFactory
}

// TransportEndpointInfo holds useful information about a transport endpoint
// which can be queried by monitoring tools.
//
// +stateify savable
type TransportEndpointInfo struct {
	// The following fields are initialized at creation time and are
	// immutable.

	NetProto   tcpip.NetworkProtocolNumber
	TransProto tcpip.TransportProtocolNumber

	// The following fields are protected by endpoint mu.

	ID TransportEndpointID
	// BindNICID and BindAddr are set via calls to Bind(). They are used to
	// reject attempts to send data or connect via a different NIC or
	// address.
	BindNICID tcpip.NICID
	BindAddr  tcpip.Address
	// RegisterNICID is the default NICID registered as a side-effect of
	// connect or datagram write.
	RegisterNICID tcpip.NICID
}

// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
// marker interface.
func (*TransportEndpointInfo) IsEndpointInfo() {}

// New allocates a new networking stack with only the requested networking and
// transport protocols configured with default options.
//
// Note, NDPConfigurations will be fixed before being used by the Stack. That
// is, if an invalid value was provided, it will be reset to the default value.
//
// Protocol options can be changed by calling the
// SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the
// stack. Please refer to individual protocol implementations as to what options
// are supported.
func New(opts Options) *Stack {
	// Fall back to the standard clock when the caller did not supply one.
	clock := opts.Clock
	if clock == nil {
		clock = &tcpip.StdClock{}
	}

	// Likewise default to the built-in counter-based ID generator.
	if opts.UniqueID == nil {
		opts.UniqueID = new(uniqueIDGenerator)
	}

	// Make sure opts.NDPConfigs contains valid values only.
	opts.NDPConfigs.validate()

	s := &Stack{
		transportProtocols:   make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
		networkProtocols:     make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
		linkAddrResolvers:    make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
		nics:                 make(map[tcpip.NICID]*NIC),
		cleanupEndpoints:     make(map[TransportEndpoint]struct{}),
		linkAddrCache:        newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
		PortManager:          ports.NewPortManager(),
		clock:                clock,
		stats:                opts.Stats.FillIn(),
		handleLocal:          opts.HandleLocal,
		icmpRateLimiter:      NewICMPRateLimiter(),
		seed:                 generateRandUint32(),
		ndpConfigs:           opts.NDPConfigs,
		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
		uniqueIDGenerator:    opts.UniqueID,
		ndpDisp:              opts.NDPDisp,
	}

	// Add specified network protocols. A protocol that also implements
	// LinkAddressResolver is additionally registered as the resolver for
	// the protocol number it reports via LinkAddressProtocol().
	for _, netProto := range opts.NetworkProtocols {
		s.networkProtocols[netProto.Number()] = netProto
		if r, ok := netProto.(LinkAddressResolver); ok {
			s.linkAddrResolvers[r.LinkAddressProtocol()] = r
		}
	}

	// Add specified transport protocols. The default handler is left unset.
	for _, transProto := range opts.TransportProtocols {
		s.transportProtocols[transProto.Number()] = &transportProtocolState{
			proto: transProto,
		}
	}

	// Add the factory for raw endpoints, if present.
	s.rawFactory = opts.RawFactory

	// Create the global transport demuxer.
	s.demux = newTransportDemuxer(s)

	return s
}

// UniqueID returns a unique identifier.
func (s *Stack) UniqueID() uint64 {
	return s.uniqueIDGenerator.UniqueID()
}

// SetNetworkProtocolOption allows configuring individual protocol level
// options. This method returns an error if the protocol is not supported or
// option is not supported by the protocol implementation or the provided value
// is incorrect.
587 func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error { 588 netProto, ok := s.networkProtocols[network] 589 if !ok { 590 return tcpip.ErrUnknownProtocol 591 } 592 return netProto.SetOption(option) 593 } 594 595 // NetworkProtocolOption allows retrieving individual protocol level option 596 // values. This method returns an error if the protocol is not supported or 597 // option is not supported by the protocol implementation. 598 // e.g. 599 // var v ipv4.MyOption 600 // err := s.NetworkProtocolOption(tcpip.IPv4ProtocolNumber, &v) 601 // if err != nil { 602 // ... 603 // } 604 func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error { 605 netProto, ok := s.networkProtocols[network] 606 if !ok { 607 return tcpip.ErrUnknownProtocol 608 } 609 return netProto.Option(option) 610 } 611 612 // SetTransportProtocolOption allows configuring individual protocol level 613 // options. This method returns an error if the protocol is not supported or 614 // option is not supported by the protocol implementation or the provided value 615 // is incorrect. 616 func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error { 617 transProtoState, ok := s.transportProtocols[transport] 618 if !ok { 619 return tcpip.ErrUnknownProtocol 620 } 621 return transProtoState.proto.SetOption(option) 622 } 623 624 // TransportProtocolOption allows retrieving individual protocol level option 625 // values. This method returns an error if the protocol is not supported or 626 // option is not supported by the protocol implementation. 627 // var v tcp.SACKEnabled 628 // if err := s.TransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil { 629 // ... 
630 // } 631 func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error { 632 transProtoState, ok := s.transportProtocols[transport] 633 if !ok { 634 return tcpip.ErrUnknownProtocol 635 } 636 return transProtoState.proto.Option(option) 637 } 638 639 // SetTransportProtocolHandler sets the per-stack default handler for the given 640 // protocol. 641 // 642 // It must be called only during initialization of the stack. Changing it as the 643 // stack is operating is not supported. 644 func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, tcpip.PacketBuffer) bool) { 645 state := s.transportProtocols[p] 646 if state != nil { 647 state.defaultHandler = h 648 } 649 } 650 651 // NowNanoseconds implements tcpip.Clock.NowNanoseconds. 652 func (s *Stack) NowNanoseconds() int64 { 653 return s.clock.NowNanoseconds() 654 } 655 656 // Stats returns a mutable copy of the current stats. 657 // 658 // This is not generally exported via the public interface, but is available 659 // internally. 660 func (s *Stack) Stats() tcpip.Stats { 661 return s.stats 662 } 663 664 // SetForwarding enables or disables the packet forwarding between NICs. 665 func (s *Stack) SetForwarding(enable bool) { 666 // TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward. 667 s.mu.Lock() 668 s.forwarding = enable 669 s.mu.Unlock() 670 } 671 672 // Forwarding returns if the packet forwarding between NICs is enabled. 673 func (s *Stack) Forwarding() bool { 674 // TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward. 675 s.mu.RLock() 676 defer s.mu.RUnlock() 677 return s.forwarding 678 } 679 680 // SetRouteTable assigns the route table to be used by this stack. It 681 // specifies which NIC to use for given destination address ranges. 
682 func (s *Stack) SetRouteTable(table []tcpip.Route) { 683 s.mu.Lock() 684 defer s.mu.Unlock() 685 686 s.routeTable = table 687 } 688 689 // GetRouteTable returns the route table which is currently in use. 690 func (s *Stack) GetRouteTable() []tcpip.Route { 691 s.mu.Lock() 692 defer s.mu.Unlock() 693 return append([]tcpip.Route(nil), s.routeTable...) 694 } 695 696 // NewEndpoint creates a new transport layer endpoint of the given protocol. 697 func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { 698 t, ok := s.transportProtocols[transport] 699 if !ok { 700 return nil, tcpip.ErrUnknownProtocol 701 } 702 703 return t.proto.NewEndpoint(s, network, waiterQueue) 704 } 705 706 // NewRawEndpoint creates a new raw transport layer endpoint of the given 707 // protocol. Raw endpoints receive all traffic for a given protocol regardless 708 // of address. 709 func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) { 710 if s.rawFactory == nil { 711 return nil, tcpip.ErrNotPermitted 712 } 713 714 if !associated { 715 return s.rawFactory.NewUnassociatedEndpoint(s, network, transport, waiterQueue) 716 } 717 718 t, ok := s.transportProtocols[transport] 719 if !ok { 720 return nil, tcpip.ErrUnknownProtocol 721 } 722 723 return t.proto.NewRawEndpoint(s, network, waiterQueue) 724 } 725 726 // NewPacketEndpoint creates a new packet endpoint listening for the given 727 // netProto. 
728 func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { 729 if s.rawFactory == nil { 730 return nil, tcpip.ErrNotPermitted 731 } 732 733 return s.rawFactory.NewPacketEndpoint(s, cooked, netProto, waiterQueue) 734 } 735 736 // createNIC creates a NIC with the provided id and link-layer endpoint, and 737 // optionally enable it. 738 func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, loopback bool) *tcpip.Error { 739 s.mu.Lock() 740 defer s.mu.Unlock() 741 742 // Make sure id is unique. 743 if _, ok := s.nics[id]; ok { 744 return tcpip.ErrDuplicateNICID 745 } 746 747 n := newNIC(s, id, name, ep, loopback) 748 749 s.nics[id] = n 750 if enabled { 751 return n.enable() 752 } 753 754 return nil 755 } 756 757 // CreateNIC creates a NIC with the provided id and link-layer endpoint. 758 func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error { 759 return s.createNIC(id, "", ep, true, false) 760 } 761 762 // CreateNamedNIC creates a NIC with the provided id and link-layer endpoint, 763 // and a human-readable name. 764 func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error { 765 return s.createNIC(id, name, ep, true, false) 766 } 767 768 // CreateNamedLoopbackNIC creates a NIC with the provided id and link-layer 769 // endpoint, and a human-readable name. 770 func (s *Stack) CreateNamedLoopbackNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error { 771 return s.createNIC(id, name, ep, true, true) 772 } 773 774 // CreateDisabledNIC creates a NIC with the provided id and link-layer endpoint, 775 // but leave it disable. Stack.EnableNIC must be called before the link-layer 776 // endpoint starts delivering packets to it. 
777 func (s *Stack) CreateDisabledNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error { 778 return s.createNIC(id, "", ep, false, false) 779 } 780 781 // CreateDisabledNamedNIC is a combination of CreateNamedNIC and 782 // CreateDisabledNIC. 783 func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error { 784 return s.createNIC(id, name, ep, false, false) 785 } 786 787 // EnableNIC enables the given NIC so that the link-layer endpoint can start 788 // delivering packets to it. 789 func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error { 790 s.mu.RLock() 791 defer s.mu.RUnlock() 792 793 nic := s.nics[id] 794 if nic == nil { 795 return tcpip.ErrUnknownNICID 796 } 797 798 return nic.enable() 799 } 800 801 // CheckNIC checks if a NIC is usable. 802 func (s *Stack) CheckNIC(id tcpip.NICID) bool { 803 s.mu.RLock() 804 nic, ok := s.nics[id] 805 s.mu.RUnlock() 806 if ok { 807 return nic.linkEP.IsAttached() 808 } 809 return false 810 } 811 812 // NICSubnets returns a map of NICIDs to their associated subnets. 813 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet { 814 s.mu.RLock() 815 defer s.mu.RUnlock() 816 817 nics := map[tcpip.NICID][]tcpip.Subnet{} 818 819 for id, nic := range s.nics { 820 nics[id] = append(nics[id], nic.AddressRanges()...) 821 } 822 return nics 823 } 824 825 // NICInfo captures the name and addresses assigned to a NIC. 826 type NICInfo struct { 827 Name string 828 LinkAddress tcpip.LinkAddress 829 ProtocolAddresses []tcpip.ProtocolAddress 830 831 // Flags indicate the state of the NIC. 832 Flags NICStateFlags 833 834 // MTU is the maximum transmission unit. 835 MTU uint32 836 837 Stats NICStats 838 } 839 840 // NICInfo returns a map of NICIDs to their associated information. 
841 func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo { 842 s.mu.RLock() 843 defer s.mu.RUnlock() 844 845 nics := make(map[tcpip.NICID]NICInfo) 846 for id, nic := range s.nics { 847 flags := NICStateFlags{ 848 Up: true, // Netstack interfaces are always up. 849 Running: nic.linkEP.IsAttached(), 850 Promiscuous: nic.isPromiscuousMode(), 851 Loopback: nic.linkEP.Capabilities()&CapabilityLoopback != 0, 852 } 853 nics[id] = NICInfo{ 854 Name: nic.name, 855 LinkAddress: nic.linkEP.LinkAddress(), 856 ProtocolAddresses: nic.PrimaryAddresses(), 857 Flags: flags, 858 MTU: nic.linkEP.MTU(), 859 Stats: nic.stats, 860 } 861 } 862 return nics 863 } 864 865 // NICStateFlags holds information about the state of an NIC. 866 type NICStateFlags struct { 867 // Up indicates whether the interface is running. 868 Up bool 869 870 // Running indicates whether resources are allocated. 871 Running bool 872 873 // Promiscuous indicates whether the interface is in promiscuous mode. 874 Promiscuous bool 875 876 // Loopback indicates whether the interface is a loopback. 877 Loopback bool 878 } 879 880 // AddAddress adds a new network-layer address to the specified NIC. 881 func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error { 882 return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint) 883 } 884 885 // AddProtocolAddress adds a new network-layer protocol address to the 886 // specified NIC. 887 func (s *Stack) AddProtocolAddress(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress) *tcpip.Error { 888 return s.AddProtocolAddressWithOptions(id, protocolAddress, CanBePrimaryEndpoint) 889 } 890 891 // AddAddressWithOptions is the same as AddAddress, but allows you to specify 892 // whether the new endpoint can be primary or not. 
893 func (s *Stack) AddAddressWithOptions(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, peb PrimaryEndpointBehavior) *tcpip.Error { 894 netProto, ok := s.networkProtocols[protocol] 895 if !ok { 896 return tcpip.ErrUnknownProtocol 897 } 898 return s.AddProtocolAddressWithOptions(id, tcpip.ProtocolAddress{ 899 Protocol: protocol, 900 AddressWithPrefix: tcpip.AddressWithPrefix{ 901 Address: addr, 902 PrefixLen: netProto.DefaultPrefixLen(), 903 }, 904 }, peb) 905 } 906 907 // AddProtocolAddressWithOptions is the same as AddProtocolAddress, but allows 908 // you to specify whether the new endpoint can be primary or not. 909 func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error { 910 s.mu.RLock() 911 defer s.mu.RUnlock() 912 913 nic := s.nics[id] 914 if nic == nil { 915 return tcpip.ErrUnknownNICID 916 } 917 918 return nic.AddAddress(protocolAddress, peb) 919 } 920 921 // AddAddressRange adds a range of addresses to the specified NIC. The range is 922 // given by a subnet address, and all addresses contained in the subnet are 923 // used except for the subnet address itself and the subnet's broadcast 924 // address. 925 func (s *Stack) AddAddressRange(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) *tcpip.Error { 926 s.mu.RLock() 927 defer s.mu.RUnlock() 928 929 if nic, ok := s.nics[id]; ok { 930 nic.AddAddressRange(protocol, subnet) 931 return nil 932 } 933 934 return tcpip.ErrUnknownNICID 935 } 936 937 // RemoveAddressRange removes the range of addresses from the specified NIC. 
938 func (s *Stack) RemoveAddressRange(id tcpip.NICID, subnet tcpip.Subnet) *tcpip.Error { 939 s.mu.RLock() 940 defer s.mu.RUnlock() 941 942 if nic, ok := s.nics[id]; ok { 943 nic.RemoveAddressRange(subnet) 944 return nil 945 } 946 947 return tcpip.ErrUnknownNICID 948 } 949 950 // RemoveAddress removes an existing network-layer address from the specified 951 // NIC. 952 func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) *tcpip.Error { 953 s.mu.RLock() 954 defer s.mu.RUnlock() 955 956 if nic, ok := s.nics[id]; ok { 957 return nic.RemoveAddress(addr) 958 } 959 960 return tcpip.ErrUnknownNICID 961 } 962 963 // AllAddresses returns a map of NICIDs to their protocol addresses (primary 964 // and non-primary). 965 func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress { 966 s.mu.RLock() 967 defer s.mu.RUnlock() 968 969 nics := make(map[tcpip.NICID][]tcpip.ProtocolAddress) 970 for id, nic := range s.nics { 971 nics[id] = nic.AllAddresses() 972 } 973 return nics 974 } 975 976 // GetMainNICAddress returns the first primary address and prefix for the given 977 // NIC and protocol. Returns an error if the NIC doesn't exist and an empty 978 // value if the NIC doesn't have a primary address for the given protocol. 
979 func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, *tcpip.Error) { 980 s.mu.RLock() 981 defer s.mu.RUnlock() 982 983 nic, ok := s.nics[id] 984 if !ok { 985 return tcpip.AddressWithPrefix{}, tcpip.ErrUnknownNICID 986 } 987 988 for _, a := range nic.PrimaryAddresses() { 989 if a.Protocol == protocol { 990 return a.AddressWithPrefix, nil 991 } 992 } 993 return tcpip.AddressWithPrefix{}, nil 994 } 995 996 func (s *Stack) getRefEP(nic *NIC, localAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) { 997 if len(localAddr) == 0 { 998 return nic.primaryEndpoint(netProto) 999 } 1000 return nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint) 1001 } 1002 1003 // FindRoute creates a route to the given destination address, leaving through 1004 // the given nic and local address (if provided). 1005 func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (Route, *tcpip.Error) { 1006 s.mu.RLock() 1007 defer s.mu.RUnlock() 1008 1009 isBroadcast := remoteAddr == header.IPv4Broadcast 1010 isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr) 1011 needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr)) 1012 if id != 0 && !needRoute { 1013 if nic, ok := s.nics[id]; ok { 1014 if ref := s.getRefEP(nic, localAddr, netProto); ref != nil { 1015 return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback), nil 1016 } 1017 } 1018 } else { 1019 for _, route := range s.routeTable { 1020 if (id != 0 && id != route.NIC) || (len(remoteAddr) != 0 && !route.Destination.Contains(remoteAddr)) { 1021 continue 1022 } 1023 if nic, ok := s.nics[route.NIC]; ok { 1024 if ref := s.getRefEP(nic, localAddr, netProto); ref != nil { 1025 if 
len(remoteAddr) == 0 { 1026 // If no remote address was provided, then the route 1027 // provided will refer to the link local address. 1028 remoteAddr = ref.ep.ID().LocalAddress 1029 } 1030 1031 r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback) 1032 if needRoute { 1033 r.NextHop = route.Gateway 1034 } 1035 return r, nil 1036 } 1037 } 1038 } 1039 } 1040 1041 if !needRoute { 1042 return Route{}, tcpip.ErrNetworkUnreachable 1043 } 1044 1045 return Route{}, tcpip.ErrNoRoute 1046 } 1047 1048 // CheckNetworkProtocol checks if a given network protocol is enabled in the 1049 // stack. 1050 func (s *Stack) CheckNetworkProtocol(protocol tcpip.NetworkProtocolNumber) bool { 1051 _, ok := s.networkProtocols[protocol] 1052 return ok 1053 } 1054 1055 // CheckLocalAddress determines if the given local address exists, and if it 1056 // does, returns the id of the NIC it's bound to. Returns 0 if the address 1057 // does not exist. 1058 func (s *Stack) CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID { 1059 s.mu.RLock() 1060 defer s.mu.RUnlock() 1061 1062 // If a NIC is specified, we try to find the address there only. 1063 if nicID != 0 { 1064 nic := s.nics[nicID] 1065 if nic == nil { 1066 return 0 1067 } 1068 1069 ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint) 1070 if ref == nil { 1071 return 0 1072 } 1073 1074 ref.decRef() 1075 1076 return nic.id 1077 } 1078 1079 // Go through all the NICs. 1080 for _, nic := range s.nics { 1081 ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint) 1082 if ref != nil { 1083 ref.decRef() 1084 return nic.id 1085 } 1086 } 1087 1088 return 0 1089 } 1090 1091 // SetPromiscuousMode enables or disables promiscuous mode in the given NIC. 
1092 func (s *Stack) SetPromiscuousMode(nicID tcpip.NICID, enable bool) *tcpip.Error { 1093 s.mu.RLock() 1094 defer s.mu.RUnlock() 1095 1096 nic := s.nics[nicID] 1097 if nic == nil { 1098 return tcpip.ErrUnknownNICID 1099 } 1100 1101 nic.setPromiscuousMode(enable) 1102 1103 return nil 1104 } 1105 1106 // SetSpoofing enables or disables address spoofing in the given NIC, allowing 1107 // endpoints to bind to any address in the NIC. 1108 func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) *tcpip.Error { 1109 s.mu.RLock() 1110 defer s.mu.RUnlock() 1111 1112 nic := s.nics[nicID] 1113 if nic == nil { 1114 return tcpip.ErrUnknownNICID 1115 } 1116 1117 nic.setSpoofing(enable) 1118 1119 return nil 1120 } 1121 1122 // AddLinkAddress adds a link address to the stack link cache. 1123 func (s *Stack) AddLinkAddress(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) { 1124 fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr} 1125 s.linkAddrCache.add(fullAddr, linkAddr) 1126 // TODO: provide a way for a transport endpoint to receive a signal 1127 // that AddLinkAddress for a particular address has been called. 1128 } 1129 1130 // GetLinkAddress implements LinkAddressCache.GetLinkAddress. 1131 func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { 1132 s.mu.RLock() 1133 nic := s.nics[nicID] 1134 if nic == nil { 1135 s.mu.RUnlock() 1136 return "", nil, tcpip.ErrUnknownNICID 1137 } 1138 s.mu.RUnlock() 1139 1140 fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr} 1141 linkRes := s.linkAddrResolvers[protocol] 1142 return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic.linkEP, waker) 1143 } 1144 1145 // RemoveWaker implements LinkAddressCache.RemoveWaker. 
1146 func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) { 1147 s.mu.RLock() 1148 defer s.mu.RUnlock() 1149 1150 if nic := s.nics[nicID]; nic == nil { 1151 fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr} 1152 s.linkAddrCache.removeWaker(fullAddr, waker) 1153 } 1154 } 1155 1156 // RegisterTransportEndpoint registers the given endpoint with the stack 1157 // transport dispatcher. Received packets that match the provided id will be 1158 // delivered to the given endpoint; specifying a nic is optional, but 1159 // nic-specific IDs have precedence over global ones. 1160 func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error { 1161 return s.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort, bindToDevice) 1162 } 1163 1164 // UnregisterTransportEndpoint removes the endpoint with the given id from the 1165 // stack transport dispatcher. 1166 func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) { 1167 s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice) 1168 } 1169 1170 // StartTransportEndpointCleanup removes the endpoint with the given id from 1171 // the stack transport dispatcher. It also transitions it to the cleanup stage. 
1172 func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) { 1173 s.mu.Lock() 1174 defer s.mu.Unlock() 1175 1176 s.cleanupEndpoints[ep] = struct{}{} 1177 1178 s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice) 1179 } 1180 1181 // CompleteTransportEndpointCleanup removes the endpoint from the cleanup 1182 // stage. 1183 func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) { 1184 s.mu.Lock() 1185 delete(s.cleanupEndpoints, ep) 1186 s.mu.Unlock() 1187 } 1188 1189 // FindTransportEndpoint finds an endpoint that most closely matches the provided 1190 // id. If no endpoint is found it returns nil. 1191 func (s *Stack) FindTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint { 1192 return s.demux.findTransportEndpoint(netProto, transProto, id, r) 1193 } 1194 1195 // RegisterRawTransportEndpoint registers the given endpoint with the stack 1196 // transport dispatcher. Received packets that match the provided transport 1197 // protocol will be delivered to the given endpoint. 1198 func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error { 1199 return s.demux.registerRawEndpoint(netProto, transProto, ep) 1200 } 1201 1202 // UnregisterRawTransportEndpoint removes the endpoint for the transport 1203 // protocol from the stack transport dispatcher. 
1204 func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) { 1205 s.demux.unregisterRawEndpoint(netProto, transProto, ep) 1206 } 1207 1208 // RegisterRestoredEndpoint records e as an endpoint that has been restored on 1209 // this stack. 1210 func (s *Stack) RegisterRestoredEndpoint(e ResumableEndpoint) { 1211 s.mu.Lock() 1212 s.resumableEndpoints = append(s.resumableEndpoints, e) 1213 s.mu.Unlock() 1214 } 1215 1216 // RegisteredEndpoints returns all endpoints which are currently registered. 1217 func (s *Stack) RegisteredEndpoints() []TransportEndpoint { 1218 s.mu.Lock() 1219 defer s.mu.Unlock() 1220 var es []TransportEndpoint 1221 for _, e := range s.demux.protocol { 1222 es = append(es, e.transportEndpoints()...) 1223 } 1224 return es 1225 } 1226 1227 // CleanupEndpoints returns endpoints currently in the cleanup state. 1228 func (s *Stack) CleanupEndpoints() []TransportEndpoint { 1229 s.mu.Lock() 1230 es := make([]TransportEndpoint, 0, len(s.cleanupEndpoints)) 1231 for e := range s.cleanupEndpoints { 1232 es = append(es, e) 1233 } 1234 s.mu.Unlock() 1235 return es 1236 } 1237 1238 // RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful 1239 // for restoring a stack after a save. 1240 func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) { 1241 s.mu.Lock() 1242 for _, e := range es { 1243 s.cleanupEndpoints[e] = struct{}{} 1244 } 1245 s.mu.Unlock() 1246 } 1247 1248 // Close closes all currently registered transport endpoints. 1249 // 1250 // Endpoints created or modified during this call may not get closed. 1251 func (s *Stack) Close() { 1252 for _, e := range s.RegisteredEndpoints() { 1253 e.Close() 1254 } 1255 } 1256 1257 // Wait waits for all transport and link endpoints to halt their worker 1258 // goroutines. 1259 // 1260 // Endpoints created or modified during this call may not get waited on. 
1261 // 1262 // Note that link endpoints must be stopped via an implementation specific 1263 // mechanism. 1264 func (s *Stack) Wait() { 1265 for _, e := range s.RegisteredEndpoints() { 1266 e.Wait() 1267 } 1268 for _, e := range s.CleanupEndpoints() { 1269 e.Wait() 1270 } 1271 1272 s.mu.RLock() 1273 defer s.mu.RUnlock() 1274 for _, n := range s.nics { 1275 n.linkEP.Wait() 1276 } 1277 } 1278 1279 // Resume restarts the stack after a restore. This must be called after the 1280 // entire system has been restored. 1281 func (s *Stack) Resume() { 1282 // ResumableEndpoint.Resume() may call other methods on s, so we can't hold 1283 // s.mu while resuming the endpoints. 1284 s.mu.Lock() 1285 eps := s.resumableEndpoints 1286 s.resumableEndpoints = nil 1287 s.mu.Unlock() 1288 for _, e := range eps { 1289 e.Resume(s) 1290 } 1291 } 1292 1293 // RegisterPacketEndpoint registers ep with the stack, causing it to receive 1294 // all traffic of the specified netProto on the given NIC. If nicID is 0, it 1295 // receives traffic from every NIC. 1296 func (s *Stack) RegisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error { 1297 s.mu.Lock() 1298 defer s.mu.Unlock() 1299 1300 // If no NIC is specified, capture on all devices. 1301 if nicID == 0 { 1302 // Register with each NIC. 1303 for _, nic := range s.nics { 1304 if err := nic.registerPacketEndpoint(netProto, ep); err != nil { 1305 s.unregisterPacketEndpointLocked(0, netProto, ep) 1306 return err 1307 } 1308 } 1309 return nil 1310 } 1311 1312 // Capture on a specific device. 1313 nic, ok := s.nics[nicID] 1314 if !ok { 1315 return tcpip.ErrUnknownNICID 1316 } 1317 if err := nic.registerPacketEndpoint(netProto, ep); err != nil { 1318 return err 1319 } 1320 1321 return nil 1322 } 1323 1324 // UnregisterPacketEndpoint unregisters ep for packets of the specified 1325 // netProto from the specified NIC. If nicID is 0, ep is unregistered from all 1326 // NICs. 
1327 func (s *Stack) UnregisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { 1328 s.mu.Lock() 1329 defer s.mu.Unlock() 1330 s.unregisterPacketEndpointLocked(nicID, netProto, ep) 1331 } 1332 1333 func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { 1334 // If no NIC is specified, unregister on all devices. 1335 if nicID == 0 { 1336 // Unregister with each NIC. 1337 for _, nic := range s.nics { 1338 nic.unregisterPacketEndpoint(netProto, ep) 1339 } 1340 return 1341 } 1342 1343 // Unregister in a single device. 1344 nic, ok := s.nics[nicID] 1345 if !ok { 1346 return 1347 } 1348 nic.unregisterPacketEndpoint(netProto, ep) 1349 } 1350 1351 // WritePacket writes data directly to the specified NIC. It adds an ethernet 1352 // header based on the arguments. 1353 func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error { 1354 s.mu.Lock() 1355 nic, ok := s.nics[nicID] 1356 s.mu.Unlock() 1357 if !ok { 1358 return tcpip.ErrUnknownDevice 1359 } 1360 1361 // Add our own fake ethernet header. 1362 ethFields := header.EthernetFields{ 1363 SrcAddr: nic.linkEP.LinkAddress(), 1364 DstAddr: dst, 1365 Type: netProto, 1366 } 1367 fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) 1368 fakeHeader.Encode(ðFields) 1369 vv := buffer.View(fakeHeader).ToVectorisedView() 1370 vv.Append(payload) 1371 1372 if err := nic.linkEP.WriteRawPacket(vv); err != nil { 1373 return err 1374 } 1375 1376 return nil 1377 } 1378 1379 // WriteRawPacket writes data directly to the specified NIC without adding any 1380 // headers. 
1381 func (s *Stack) WriteRawPacket(nicID tcpip.NICID, payload buffer.VectorisedView) *tcpip.Error { 1382 s.mu.Lock() 1383 nic, ok := s.nics[nicID] 1384 s.mu.Unlock() 1385 if !ok { 1386 return tcpip.ErrUnknownDevice 1387 } 1388 1389 if err := nic.linkEP.WriteRawPacket(payload); err != nil { 1390 return err 1391 } 1392 1393 return nil 1394 } 1395 1396 // NetworkProtocolInstance returns the protocol instance in the stack for the 1397 // specified network protocol. This method is public for protocol implementers 1398 // and tests to use. 1399 func (s *Stack) NetworkProtocolInstance(num tcpip.NetworkProtocolNumber) NetworkProtocol { 1400 if p, ok := s.networkProtocols[num]; ok { 1401 return p 1402 } 1403 return nil 1404 } 1405 1406 // TransportProtocolInstance returns the protocol instance in the stack for the 1407 // specified transport protocol. This method is public for protocol implementers 1408 // and tests to use. 1409 func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) TransportProtocol { 1410 if pState, ok := s.transportProtocols[num]; ok { 1411 return pState.proto 1412 } 1413 return nil 1414 } 1415 1416 // AddTCPProbe installs a probe function that will be invoked on every segment 1417 // received by a given TCP endpoint. The probe function is passed a copy of the 1418 // TCP endpoint state before and after processing of the segment. 1419 // 1420 // NOTE: TCPProbe is added only to endpoints created after this call. Endpoints 1421 // created prior to this call will not call the probe function. 1422 // 1423 // Further, installing two different probes back to back can result in some 1424 // endpoints calling the first one and some the second one. There is no 1425 // guarantee provided on which probe will be invoked. Ideally this should only 1426 // be called once per stack. 
1427 func (s *Stack) AddTCPProbe(probe TCPProbeFunc) { 1428 s.mu.Lock() 1429 s.tcpProbeFunc = probe 1430 s.mu.Unlock() 1431 } 1432 1433 // GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil 1434 // otherwise. 1435 func (s *Stack) GetTCPProbe() TCPProbeFunc { 1436 s.mu.Lock() 1437 p := s.tcpProbeFunc 1438 s.mu.Unlock() 1439 return p 1440 } 1441 1442 // RemoveTCPProbe removes an installed TCP probe. 1443 // 1444 // NOTE: This only ensures that endpoints created after this call do not 1445 // have a probe attached. Endpoints already created will continue to invoke 1446 // TCP probe. 1447 func (s *Stack) RemoveTCPProbe() { 1448 s.mu.Lock() 1449 s.tcpProbeFunc = nil 1450 s.mu.Unlock() 1451 } 1452 1453 // JoinGroup joins the given multicast group on the given NIC. 1454 func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error { 1455 // TODO: notify network of subscription via igmp protocol. 1456 s.mu.RLock() 1457 defer s.mu.RUnlock() 1458 1459 if nic, ok := s.nics[nicID]; ok { 1460 return nic.joinGroup(protocol, multicastAddr) 1461 } 1462 return tcpip.ErrUnknownNICID 1463 } 1464 1465 // LeaveGroup leaves the given multicast group on the given NIC. 1466 func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error { 1467 s.mu.RLock() 1468 defer s.mu.RUnlock() 1469 1470 if nic, ok := s.nics[nicID]; ok { 1471 return nic.leaveGroup(multicastAddr) 1472 } 1473 return tcpip.ErrUnknownNICID 1474 } 1475 1476 // IPTables returns the stack's iptables. 1477 func (s *Stack) IPTables() iptables.IPTables { 1478 return s.tables 1479 } 1480 1481 // SetIPTables sets the stack's iptables. 1482 func (s *Stack) SetIPTables(ipt iptables.IPTables) { 1483 s.tables = ipt 1484 } 1485 1486 // ICMPLimit returns the maximum number of ICMP messages that can be sent 1487 // in one second. 
1488 func (s *Stack) ICMPLimit() rate.Limit { 1489 return s.icmpRateLimiter.Limit() 1490 } 1491 1492 // SetICMPLimit sets the maximum number of ICMP messages that be sent 1493 // in one second. 1494 func (s *Stack) SetICMPLimit(newLimit rate.Limit) { 1495 s.icmpRateLimiter.SetLimit(newLimit) 1496 } 1497 1498 // ICMPBurst returns the maximum number of ICMP messages that can be sent 1499 // in a single burst. 1500 func (s *Stack) ICMPBurst() int { 1501 return s.icmpRateLimiter.Burst() 1502 } 1503 1504 // SetICMPBurst sets the maximum number of ICMP messages that can be sent 1505 // in a single burst. 1506 func (s *Stack) SetICMPBurst(burst int) { 1507 s.icmpRateLimiter.SetBurst(burst) 1508 } 1509 1510 // AllowICMPMessage returns true if we the rate limiter allows at least one 1511 // ICMP message to be sent at this instant. 1512 func (s *Stack) AllowICMPMessage() bool { 1513 return s.icmpRateLimiter.Allow() 1514 } 1515 1516 // IsAddrTentative returns true if addr is tentative on the NIC with ID id. 1517 // 1518 // Note that if addr is not associated with a NIC with id ID, then this 1519 // function will return false. It will only return true if the address is 1520 // associated with the NIC AND it is tentative. 1521 func (s *Stack) IsAddrTentative(id tcpip.NICID, addr tcpip.Address) (bool, *tcpip.Error) { 1522 s.mu.RLock() 1523 defer s.mu.RUnlock() 1524 1525 nic, ok := s.nics[id] 1526 if !ok { 1527 return false, tcpip.ErrUnknownNICID 1528 } 1529 1530 return nic.isAddrTentative(addr), nil 1531 } 1532 1533 // DupTentativeAddrDetected attempts to inform the NIC with ID id that a 1534 // tentative addr on it is a duplicate on a link. 
1535 func (s *Stack) DupTentativeAddrDetected(id tcpip.NICID, addr tcpip.Address) *tcpip.Error { 1536 s.mu.Lock() 1537 defer s.mu.Unlock() 1538 1539 nic, ok := s.nics[id] 1540 if !ok { 1541 return tcpip.ErrUnknownNICID 1542 } 1543 1544 return nic.dupTentativeAddrDetected(addr) 1545 } 1546 1547 // SetNDPConfigurations sets the per-interface NDP configurations on the NIC 1548 // with ID id to c. 1549 // 1550 // Note, if c contains invalid NDP configuration values, it will be fixed to 1551 // use default values for the erroneous values. 1552 func (s *Stack) SetNDPConfigurations(id tcpip.NICID, c NDPConfigurations) *tcpip.Error { 1553 s.mu.Lock() 1554 defer s.mu.Unlock() 1555 1556 nic, ok := s.nics[id] 1557 if !ok { 1558 return tcpip.ErrUnknownNICID 1559 } 1560 1561 nic.setNDPConfigs(c) 1562 1563 return nil 1564 } 1565 1566 // HandleNDPRA provides a NIC with ID id a validated NDP Router Advertisement 1567 // message that it needs to handle. 1568 func (s *Stack) HandleNDPRA(id tcpip.NICID, ip tcpip.Address, ra header.NDPRouterAdvert) *tcpip.Error { 1569 s.mu.Lock() 1570 defer s.mu.Unlock() 1571 1572 nic, ok := s.nics[id] 1573 if !ok { 1574 return tcpip.ErrUnknownNICID 1575 } 1576 1577 nic.handleNDPRA(ip, ra) 1578 1579 return nil 1580 } 1581 1582 // Seed returns a 32 bit value that can be used as a seed value for port 1583 // picking, ISN generation etc. 1584 // 1585 // NOTE: The seed is generated once during stack initialization only. 1586 func (s *Stack) Seed() uint32 { 1587 return s.seed 1588 } 1589 1590 func generateRandUint32() uint32 { 1591 b := make([]byte, 4) 1592 if _, err := rand.Read(b); err != nil { 1593 panic(err) 1594 } 1595 return binary.LittleEndian.Uint32(b) 1596 }