gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/boot/network.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package boot 16 17 import ( 18 "fmt" 19 "io" 20 "net" 21 "os" 22 "runtime" 23 "strings" 24 25 "golang.org/x/sys/unix" 26 "gvisor.dev/gvisor/pkg/hostos" 27 "gvisor.dev/gvisor/pkg/log" 28 "gvisor.dev/gvisor/pkg/sentry/kernel" 29 "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" 30 "gvisor.dev/gvisor/pkg/tcpip" 31 "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" 32 "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" 33 "gvisor.dev/gvisor/pkg/tcpip/link/loopback" 34 "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" 35 "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" 36 "gvisor.dev/gvisor/pkg/tcpip/link/xdp" 37 "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" 38 "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" 39 "gvisor.dev/gvisor/pkg/tcpip/stack" 40 "gvisor.dev/gvisor/pkg/urpc" 41 "gvisor.dev/gvisor/runsc/config" 42 ) 43 44 var ( 45 // DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and 46 // "::1/8" on "lo" interface. 47 DefaultLoopbackLink = LoopbackLink{ 48 Name: "lo", 49 Addresses: []IPWithPrefix{ 50 {Address: net.IP("\x7f\x00\x00\x01"), PrefixLen: 8}, 51 {Address: net.IPv6loopback, PrefixLen: 128}, 52 }, 53 Routes: []Route{ 54 { 55 Destination: net.IPNet{ 56 IP: net.IPv4(0x7f, 0, 0, 0), 57 Mask: net.IPv4Mask(0xff, 0, 0, 0), 58 }, 59 }, 60 { 61 Destination: net.IPNet{ 62 IP: net.IPv6loopback, 63 Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)), 64 }, 65 }, 66 }, 67 } 68 ) 69 70 // Network exposes methods that can be used to configure a network stack. 71 type Network struct { 72 Stack *stack.Stack 73 Kernel *kernel.Kernel 74 } 75 76 // Route represents a route in the network stack. 77 type Route struct { 78 Destination net.IPNet 79 Gateway net.IP 80 } 81 82 // DefaultRoute represents a catch all route to the default gateway. 83 type DefaultRoute struct { 84 Route Route 85 Name string 86 } 87 88 type Neighbor struct { 89 IP net.IP 90 HardwareAddr net.HardwareAddr 91 } 92 93 // FDBasedLink configures an fd-based link. 94 type FDBasedLink struct { 95 Name string 96 InterfaceIndex int 97 MTU int 98 Addresses []IPWithPrefix 99 Routes []Route 100 GSOMaxSize uint32 101 GVisorGSOEnabled bool 102 GVisorGRO bool 103 TXChecksumOffload bool 104 RXChecksumOffload bool 105 LinkAddress net.HardwareAddr 106 QDisc config.QueueingDiscipline 107 Neighbors []Neighbor 108 109 // NumChannels controls how many underlying FDs are to be used to 110 // create this endpoint. 111 NumChannels int 112 113 // ProcessorsPerChannel controls how many goroutines are used to handle 114 // packets on each channel. 115 ProcessorsPerChannel int 116 } 117 118 // BindOpt indicates whether the sentry or runsc process is responsible for 119 // binding the AF_XDP socket. 120 type BindOpt int 121 122 const ( 123 // BindSentry indicates the sentry process must call bind. 124 BindSentry BindOpt = iota 125 126 // BindRunsc indicates the runsc process must call bind. 127 BindRunsc 128 ) 129 130 // XDPLink configures an XDP link. 131 type XDPLink struct { 132 Name string 133 InterfaceIndex int 134 MTU int 135 Addresses []IPWithPrefix 136 Routes []Route 137 TXChecksumOffload bool 138 RXChecksumOffload bool 139 LinkAddress net.HardwareAddr 140 QDisc config.QueueingDiscipline 141 Neighbors []Neighbor 142 GVisorGRO bool 143 Bind BindOpt 144 145 // NumChannels controls how many underlying FDs are to be used to 146 // create this endpoint. 147 NumChannels int 148 } 149 150 // LoopbackLink configures a loopback link. 151 type LoopbackLink struct { 152 Name string 153 Addresses []IPWithPrefix 154 Routes []Route 155 GVisorGRO bool 156 } 157 158 // CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes. 159 type CreateLinksAndRoutesArgs struct { 160 // FilePayload contains the fds associated with the FDBasedLinks. The 161 // number of fd's should match the sum of the NumChannels field of the 162 // FDBasedLink entries below. 163 urpc.FilePayload 164 165 LoopbackLinks []LoopbackLink 166 FDBasedLinks []FDBasedLink 167 XDPLinks []XDPLink 168 169 Defaultv4Gateway DefaultRoute 170 Defaultv6Gateway DefaultRoute 171 172 // PCAP indicates that FilePayload also contains a PCAP log file. 173 PCAP bool 174 175 // LogPackets indicates that packets should be logged. 176 LogPackets bool 177 178 // NATBlob indicates whether FilePayload also contains an iptables NAT 179 // ruleset. 180 NATBlob bool 181 182 // DisconnectOk indicates that link endpoints should have the capability 183 // CapabilityDisconnectOk set. 184 DisconnectOk bool 185 } 186 187 // IPWithPrefix is an address with its subnet prefix length. 188 type IPWithPrefix struct { 189 // Address is a network address. 190 Address net.IP 191 192 // PrefixLen is the subnet prefix length. 193 PrefixLen int 194 } 195 196 func (ip IPWithPrefix) String() string { 197 return fmt.Sprintf("%s/%d", ip.Address, ip.PrefixLen) 198 } 199 200 // Empty returns true if route hasn't been set. 201 func (r *Route) Empty() bool { 202 return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil 203 } 204 205 func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) { 206 subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask)) 207 if err != nil { 208 return tcpip.Route{}, err 209 } 210 return tcpip.Route{ 211 Destination: subnet, 212 Gateway: ipToAddress(r.Gateway), 213 NIC: id, 214 }, nil 215 } 216 217 // CreateLinksAndRoutes creates links and routes in a network stack. It should 218 // only be called once. 219 func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { 220 if len(args.FDBasedLinks) > 0 && len(args.XDPLinks) > 0 { 221 return fmt.Errorf("received both fdbased and XDP links, but only one can be used at a time") 222 } 223 wantFDs := 0 224 for _, l := range args.FDBasedLinks { 225 wantFDs += l.NumChannels 226 } 227 for _, link := range args.XDPLinks { 228 // We have to keep several FDs alive when the sentry is 229 // responsible for binding, but when runsc binds we only expect 230 // the AF_XDP socket itself. 231 switch v := link.Bind; v { 232 case BindSentry: 233 wantFDs += 4 234 case BindRunsc: 235 wantFDs++ 236 default: 237 return fmt.Errorf("unknown bind value: %d", v) 238 } 239 } 240 if args.PCAP { 241 wantFDs++ 242 } 243 if args.NATBlob { 244 wantFDs++ 245 } 246 if got := len(args.FilePayload.Files); got != wantFDs { 247 return fmt.Errorf("args.FilePayload.Files has %d FDs but we need %d entries based on FDBasedLinks, XDPLinks, and PCAP", got, wantFDs) 248 } 249 250 var nicID tcpip.NICID 251 nicids := make(map[string]tcpip.NICID) 252 253 // Collect routes from all links. 254 var routes []tcpip.Route 255 256 // Loopback normally appear before other interfaces. 257 for _, link := range args.LoopbackLinks { 258 nicID++ 259 nicids[link.Name] = nicID 260 261 linkEP := ethernet.New(loopback.New()) 262 263 log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) 264 opts := stack.NICOptions{ 265 Name: link.Name, 266 DeliverLinkPackets: true, 267 } 268 if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { 269 return err 270 } 271 272 // Collect the routes from this link. 273 for _, r := range link.Routes { 274 route, err := r.toTcpipRoute(nicID) 275 if err != nil { 276 return err 277 } 278 routes = append(routes, route) 279 } 280 } 281 282 // Setup fdbased or XDP links. 283 fdOffset := 0 284 if len(args.FDBasedLinks) > 0 { 285 // Choose a dispatch mode. 286 dispatchMode := fdbased.RecvMMsg 287 version, err := hostos.KernelVersion() 288 if err != nil { 289 return err 290 } 291 if version.AtLeast(5, 6) { 292 // TODO(b/333120887): Switch back to using the packet mmap dispatcher when 293 // we have the performance data to justify it. 294 // dispatchMode = fdbased.PacketMMap 295 // log.Infof("Host kernel version >= 5.6, using to packet mmap to dispatch") 296 } else { 297 log.Infof("Host kernel version < 5.6, using to RecvMMsg to dispatch") 298 } 299 300 for _, link := range args.FDBasedLinks { 301 nicID++ 302 nicids[link.Name] = nicID 303 304 FDs := make([]int, 0, link.NumChannels) 305 for j := 0; j < link.NumChannels; j++ { 306 // Copy the underlying FD. 307 oldFD := args.FilePayload.Files[fdOffset].Fd() 308 newFD, err := unix.Dup(int(oldFD)) 309 if err != nil { 310 return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) 311 } 312 FDs = append(FDs, newFD) 313 fdOffset++ 314 } 315 316 mac := tcpip.LinkAddress(link.LinkAddress) 317 log.Infof("gso max size is: %d", link.GSOMaxSize) 318 319 linkEP, err := fdbased.New(&fdbased.Options{ 320 FDs: FDs, 321 MTU: uint32(link.MTU), 322 EthernetHeader: mac != "", 323 Address: mac, 324 PacketDispatchMode: dispatchMode, 325 GSOMaxSize: link.GSOMaxSize, 326 GVisorGSOEnabled: link.GVisorGSOEnabled, 327 TXChecksumOffload: link.TXChecksumOffload, 328 RXChecksumOffload: link.RXChecksumOffload, 329 GRO: link.GVisorGRO, 330 ProcessorsPerChannel: link.ProcessorsPerChannel, 331 DisconnectOk: args.DisconnectOk, 332 }) 333 if err != nil { 334 return err 335 } 336 337 // Setup packet logging if requested. 338 if args.PCAP { 339 newFD, err := unix.Dup(int(args.FilePayload.Files[fdOffset].Fd())) 340 if err != nil { 341 return fmt.Errorf("failed to dup pcap FD: %v", err) 342 } 343 const packetTruncateSize = 4096 344 linkEP, err = sniffer.NewWithWriter(linkEP, os.NewFile(uintptr(newFD), "pcap-file"), packetTruncateSize) 345 if err != nil { 346 return fmt.Errorf("failed to create PCAP logger: %v", err) 347 } 348 fdOffset++ 349 } else if args.LogPackets { 350 linkEP = sniffer.New(linkEP) 351 } 352 353 var qDisc stack.QueueingDiscipline 354 switch link.QDisc { 355 case config.QDiscNone: 356 case config.QDiscFIFO: 357 log.Infof("Enabling FIFO QDisc on %q", link.Name) 358 qDisc = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) 359 } 360 361 log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) 362 opts := stack.NICOptions{ 363 Name: link.Name, 364 QDisc: qDisc, 365 DeliverLinkPackets: true, 366 } 367 if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { 368 return err 369 } 370 371 // Collect the routes from this link. 372 for _, r := range link.Routes { 373 route, err := r.toTcpipRoute(nicID) 374 if err != nil { 375 return err 376 } 377 routes = append(routes, route) 378 } 379 380 for _, neigh := range link.Neighbors { 381 proto, tcpipAddr := ipToAddressAndProto(neigh.IP) 382 n.Stack.AddStaticNeighbor(nicID, proto, tcpipAddr, tcpip.LinkAddress(neigh.HardwareAddr)) 383 } 384 } 385 } else if len(args.XDPLinks) > 0 { 386 if nlinks := len(args.XDPLinks); nlinks > 1 { 387 return fmt.Errorf("XDP only supports one link device, but got %d", nlinks) 388 } 389 link := args.XDPLinks[0] 390 nicID++ 391 nicids[link.Name] = nicID 392 393 // Get the AF_XDP socket. 394 oldFD := args.FilePayload.Files[fdOffset].Fd() 395 fd, err := unix.Dup(int(oldFD)) 396 if err != nil { 397 return fmt.Errorf("failed to dup AF_XDP fd %v: %v", oldFD, err) 398 } 399 fdOffset++ 400 401 // When the sentry is responsible for binding, the runsc 402 // process sends several other FDs in order to keep them open 403 // and alive. These are for BPF programs and maps that, if 404 // closed, will break the dispatcher. 405 if link.Bind == BindSentry { 406 for _, fdName := range []string{"program-fd", "sockmap-fd", "link-fd"} { 407 oldFD := args.FilePayload.Files[fdOffset].Fd() 408 if _, err := unix.Dup(int(oldFD)); err != nil { 409 return fmt.Errorf("failed to dup %s with FD %d: %v", fdName, oldFD, err) 410 } 411 fdOffset++ 412 } 413 } 414 415 // Setup packet logging if requested. 416 mac := tcpip.LinkAddress(link.LinkAddress) 417 linkEP, err := xdp.New(&xdp.Options{ 418 FD: fd, 419 Address: mac, 420 TXChecksumOffload: link.TXChecksumOffload, 421 RXChecksumOffload: link.RXChecksumOffload, 422 InterfaceIndex: link.InterfaceIndex, 423 Bind: link.Bind == BindSentry, 424 GRO: link.GVisorGRO, 425 DisconnectOk: args.DisconnectOk, 426 }) 427 if err != nil { 428 return err 429 } 430 431 if args.PCAP { 432 newFD, err := unix.Dup(int(args.FilePayload.Files[fdOffset].Fd())) 433 if err != nil { 434 return fmt.Errorf("failed to dup pcap FD: %v", err) 435 } 436 const packetTruncateSize = 4096 437 linkEP, err = sniffer.NewWithWriter(linkEP, os.NewFile(uintptr(newFD), "pcap-file"), packetTruncateSize) 438 if err != nil { 439 return fmt.Errorf("failed to create PCAP logger: %v", err) 440 } 441 fdOffset++ 442 } else if args.LogPackets { 443 linkEP = sniffer.New(linkEP) 444 } 445 446 var qDisc stack.QueueingDiscipline 447 switch link.QDisc { 448 case config.QDiscNone: 449 case config.QDiscFIFO: 450 log.Infof("Enabling FIFO QDisc on %q", link.Name) 451 qDisc = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) 452 } 453 454 log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) 455 opts := stack.NICOptions{ 456 Name: link.Name, 457 QDisc: qDisc, 458 DeliverLinkPackets: true, 459 } 460 if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { 461 return err 462 } 463 464 // Collect the routes from this link. 465 for _, r := range link.Routes { 466 route, err := r.toTcpipRoute(nicID) 467 if err != nil { 468 return err 469 } 470 routes = append(routes, route) 471 } 472 473 for _, neigh := range link.Neighbors { 474 proto, tcpipAddr := ipToAddressAndProto(neigh.IP) 475 n.Stack.AddStaticNeighbor(nicID, proto, tcpipAddr, tcpip.LinkAddress(neigh.HardwareAddr)) 476 } 477 } 478 479 if !args.Defaultv4Gateway.Route.Empty() { 480 nicID, ok := nicids[args.Defaultv4Gateway.Name] 481 if !ok { 482 return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name) 483 } 484 route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID) 485 if err != nil { 486 return err 487 } 488 routes = append(routes, route) 489 } 490 491 if !args.Defaultv6Gateway.Route.Empty() { 492 nicID, ok := nicids[args.Defaultv6Gateway.Name] 493 if !ok { 494 return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name) 495 } 496 route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID) 497 if err != nil { 498 return err 499 } 500 routes = append(routes, route) 501 } 502 503 log.Infof("Setting routes %+v", routes) 504 n.Stack.SetRouteTable(routes) 505 506 // Set NAT table rules if necessary. 507 if args.NATBlob { 508 log.Infof("Replacing NAT table") 509 iptReplaceBlob, err := io.ReadAll(args.FilePayload.Files[fdOffset]) 510 if err != nil { 511 return fmt.Errorf("failed to read iptables blob: %v", err) 512 } 513 fdOffset++ 514 if err := netfilter.SetEntries(n.Kernel.RootUserNamespace(), n.Stack, iptReplaceBlob, false); err != nil { 515 return fmt.Errorf("failed to SetEntries: %v", err) 516 } 517 } 518 519 return nil 520 } 521 522 // createNICWithAddrs creates a NIC in the network stack and adds the given 523 // addresses. 524 func (n *Network) createNICWithAddrs(id tcpip.NICID, ep stack.LinkEndpoint, opts stack.NICOptions, addrs []IPWithPrefix) error { 525 if err := n.Stack.CreateNICWithOptions(id, ep, opts); err != nil { 526 return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err) 527 } 528 529 for _, addr := range addrs { 530 proto, tcpipAddr := ipToAddressAndProto(addr.Address) 531 protocolAddr := tcpip.ProtocolAddress{ 532 Protocol: proto, 533 AddressWithPrefix: tcpip.AddressWithPrefix{ 534 Address: tcpipAddr, 535 PrefixLen: addr.PrefixLen, 536 }, 537 } 538 if err := n.Stack.AddProtocolAddress(id, protocolAddr, stack.AddressProperties{}); err != nil { 539 return fmt.Errorf("AddProtocolAddress(%d, %+v, {}) failed: %s", id, protocolAddr, err) 540 } 541 } 542 return nil 543 } 544 545 // ipToAddressAndProto converts IP to tcpip.Address and a protocol number. 546 // 547 // Note: don't use 'len(ip)' to determine IP version because length is always 16. 548 func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) { 549 if i4 := ip.To4(); i4 != nil { 550 return ipv4.ProtocolNumber, tcpip.AddrFromSlice(i4) 551 } 552 return ipv6.ProtocolNumber, tcpip.AddrFromSlice(ip) 553 } 554 555 // ipToAddress converts IP to tcpip.Address, ignoring the protocol. 556 func ipToAddress(ip net.IP) tcpip.Address { 557 _, addr := ipToAddressAndProto(ip) 558 return addr 559 } 560 561 // ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the 562 // protocol. 563 func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask { 564 addr := ipToAddress(net.IP(ipMask)) 565 return tcpip.MaskFromBytes(addr.AsSlice()) 566 }