github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/boot/network.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package boot 16 17 import ( 18 "fmt" 19 "io" 20 "net" 21 "os" 22 "runtime" 23 "strings" 24 "time" 25 26 "golang.org/x/sys/unix" 27 "github.com/metacubex/gvisor/pkg/hostos" 28 "github.com/metacubex/gvisor/pkg/log" 29 "github.com/metacubex/gvisor/pkg/sentry/kernel" 30 "github.com/metacubex/gvisor/pkg/sentry/socket/netfilter" 31 "github.com/metacubex/gvisor/pkg/tcpip" 32 "github.com/metacubex/gvisor/pkg/tcpip/link/ethernet" 33 "github.com/metacubex/gvisor/pkg/tcpip/link/fdbased" 34 "github.com/metacubex/gvisor/pkg/tcpip/link/loopback" 35 "github.com/metacubex/gvisor/pkg/tcpip/link/qdisc/fifo" 36 "github.com/metacubex/gvisor/pkg/tcpip/link/sniffer" 37 "github.com/metacubex/gvisor/pkg/tcpip/link/xdp" 38 "github.com/metacubex/gvisor/pkg/tcpip/network/ipv4" 39 "github.com/metacubex/gvisor/pkg/tcpip/network/ipv6" 40 "github.com/metacubex/gvisor/pkg/tcpip/stack" 41 "github.com/metacubex/gvisor/pkg/urpc" 42 "github.com/metacubex/gvisor/runsc/config" 43 ) 44 45 var ( 46 // DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and 47 // "::1/8" on "lo" interface. 48 DefaultLoopbackLink = LoopbackLink{ 49 Name: "lo", 50 Addresses: []IPWithPrefix{ 51 {Address: net.IP("\x7f\x00\x00\x01"), PrefixLen: 8}, 52 {Address: net.IPv6loopback, PrefixLen: 128}, 53 }, 54 Routes: []Route{ 55 { 56 Destination: net.IPNet{ 57 IP: net.IPv4(0x7f, 0, 0, 0), 58 Mask: net.IPv4Mask(0xff, 0, 0, 0), 59 }, 60 }, 61 { 62 Destination: net.IPNet{ 63 IP: net.IPv6loopback, 64 Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)), 65 }, 66 }, 67 }, 68 } 69 ) 70 71 // Network exposes methods that can be used to configure a network stack. 72 type Network struct { 73 Stack *stack.Stack 74 Kernel *kernel.Kernel 75 } 76 77 // Route represents a route in the network stack. 78 type Route struct { 79 Destination net.IPNet 80 Gateway net.IP 81 } 82 83 // DefaultRoute represents a catch all route to the default gateway. 84 type DefaultRoute struct { 85 Route Route 86 Name string 87 } 88 89 type Neighbor struct { 90 IP net.IP 91 HardwareAddr net.HardwareAddr 92 } 93 94 // FDBasedLink configures an fd-based link. 95 type FDBasedLink struct { 96 Name string 97 InterfaceIndex int 98 MTU int 99 Addresses []IPWithPrefix 100 Routes []Route 101 GSOMaxSize uint32 102 GvisorGSOEnabled bool 103 GvisorGROTimeout time.Duration 104 TXChecksumOffload bool 105 RXChecksumOffload bool 106 LinkAddress net.HardwareAddr 107 QDisc config.QueueingDiscipline 108 Neighbors []Neighbor 109 110 // NumChannels controls how many underlying FDs are to be used to 111 // create this endpoint. 112 NumChannels int 113 } 114 115 // BindOpt indicates whether the sentry or runsc process is responsible for 116 // binding the AF_XDP socket. 117 type BindOpt int 118 119 const ( 120 // BindSentry indicates the sentry process must call bind. 121 BindSentry BindOpt = iota 122 123 // BindRunsc indicates the runsc process must call bind. 124 BindRunsc 125 ) 126 127 // XDPLink configures an XDP link. 128 type XDPLink struct { 129 Name string 130 InterfaceIndex int 131 MTU int 132 Addresses []IPWithPrefix 133 Routes []Route 134 TXChecksumOffload bool 135 RXChecksumOffload bool 136 LinkAddress net.HardwareAddr 137 QDisc config.QueueingDiscipline 138 Neighbors []Neighbor 139 GvisorGROTimeout time.Duration 140 Bind BindOpt 141 142 // NumChannels controls how many underlying FDs are to be used to 143 // create this endpoint. 144 NumChannels int 145 } 146 147 // LoopbackLink configures a loopback link. 148 type LoopbackLink struct { 149 Name string 150 Addresses []IPWithPrefix 151 Routes []Route 152 GvisorGROTimeout time.Duration 153 } 154 155 // CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes. 156 type CreateLinksAndRoutesArgs struct { 157 // FilePayload contains the fds associated with the FDBasedLinks. The 158 // number of fd's should match the sum of the NumChannels field of the 159 // FDBasedLink entries below. 160 urpc.FilePayload 161 162 LoopbackLinks []LoopbackLink 163 FDBasedLinks []FDBasedLink 164 XDPLinks []XDPLink 165 166 Defaultv4Gateway DefaultRoute 167 Defaultv6Gateway DefaultRoute 168 169 // PCAP indicates that FilePayload also contains a PCAP log file. 170 PCAP bool 171 172 // LogPackets indicates that packets should be logged. 173 LogPackets bool 174 175 // NATBlob indicates whether FilePayload also contains an iptables NAT 176 // ruleset. 177 NATBlob bool 178 } 179 180 // IPWithPrefix is an address with its subnet prefix length. 181 type IPWithPrefix struct { 182 // Address is a network address. 183 Address net.IP 184 185 // PrefixLen is the subnet prefix length. 186 PrefixLen int 187 } 188 189 func (ip IPWithPrefix) String() string { 190 return fmt.Sprintf("%s/%d", ip.Address, ip.PrefixLen) 191 } 192 193 // Empty returns true if route hasn't been set. 194 func (r *Route) Empty() bool { 195 return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil 196 } 197 198 func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) { 199 subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask)) 200 if err != nil { 201 return tcpip.Route{}, err 202 } 203 return tcpip.Route{ 204 Destination: subnet, 205 Gateway: ipToAddress(r.Gateway), 206 NIC: id, 207 }, nil 208 } 209 210 // CreateLinksAndRoutes creates links and routes in a network stack. It should 211 // only be called once. 212 func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { 213 if len(args.FDBasedLinks) > 0 && len(args.XDPLinks) > 0 { 214 return fmt.Errorf("received both fdbased and XDP links, but only one can be used at a time") 215 } 216 wantFDs := 0 217 for _, l := range args.FDBasedLinks { 218 wantFDs += l.NumChannels 219 } 220 for _, link := range args.XDPLinks { 221 // We have to keep several FDs alive when the sentry is 222 // responsible for binding, but when runsc binds we only expect 223 // the AF_XDP socket itself. 224 switch v := link.Bind; v { 225 case BindSentry: 226 wantFDs += 4 227 case BindRunsc: 228 wantFDs++ 229 default: 230 return fmt.Errorf("unknown bind value: %d", v) 231 } 232 } 233 if args.PCAP { 234 wantFDs++ 235 } 236 if args.NATBlob { 237 wantFDs++ 238 } 239 if got := len(args.FilePayload.Files); got != wantFDs { 240 return fmt.Errorf("args.FilePayload.Files has %d FDs but we need %d entries based on FDBasedLinks, XDPLinks, and PCAP", got, wantFDs) 241 } 242 243 var nicID tcpip.NICID 244 nicids := make(map[string]tcpip.NICID) 245 246 // Collect routes from all links. 247 var routes []tcpip.Route 248 249 // Loopback normally appear before other interfaces. 250 for _, link := range args.LoopbackLinks { 251 nicID++ 252 nicids[link.Name] = nicID 253 254 linkEP := ethernet.New(loopback.New()) 255 256 log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) 257 opts := stack.NICOptions{ 258 Name: link.Name, 259 GROTimeout: link.GvisorGROTimeout, 260 DeliverLinkPackets: true, 261 } 262 if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { 263 return err 264 } 265 266 // Collect the routes from this link. 267 for _, r := range link.Routes { 268 route, err := r.toTcpipRoute(nicID) 269 if err != nil { 270 return err 271 } 272 routes = append(routes, route) 273 } 274 } 275 276 // Setup fdbased or XDP links. 277 fdOffset := 0 278 if len(args.FDBasedLinks) > 0 { 279 // Choose a dispatch mode. 280 dispatchMode := fdbased.RecvMMsg 281 version, err := hostos.KernelVersion() 282 if err != nil { 283 return err 284 } 285 if version.AtLeast(5, 6) { 286 dispatchMode = fdbased.PacketMMap 287 } else { 288 log.Infof("Host kernel version < 5.6, falling back to RecvMMsg dispatch") 289 } 290 291 for _, link := range args.FDBasedLinks { 292 nicID++ 293 nicids[link.Name] = nicID 294 295 FDs := make([]int, 0, link.NumChannels) 296 for j := 0; j < link.NumChannels; j++ { 297 // Copy the underlying FD. 298 oldFD := args.FilePayload.Files[fdOffset].Fd() 299 newFD, err := unix.Dup(int(oldFD)) 300 if err != nil { 301 return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) 302 } 303 FDs = append(FDs, newFD) 304 fdOffset++ 305 } 306 307 mac := tcpip.LinkAddress(link.LinkAddress) 308 log.Infof("gso max size is: %d", link.GSOMaxSize) 309 310 linkEP, err := fdbased.New(&fdbased.Options{ 311 FDs: FDs, 312 MTU: uint32(link.MTU), 313 EthernetHeader: mac != "", 314 Address: mac, 315 PacketDispatchMode: dispatchMode, 316 GSOMaxSize: link.GSOMaxSize, 317 GvisorGSOEnabled: link.GvisorGSOEnabled, 318 TXChecksumOffload: link.TXChecksumOffload, 319 RXChecksumOffload: link.RXChecksumOffload, 320 }) 321 if err != nil { 322 return err 323 } 324 325 // Setup packet logging if requested. 326 if args.PCAP { 327 newFD, err := unix.Dup(int(args.FilePayload.Files[fdOffset].Fd())) 328 if err != nil { 329 return fmt.Errorf("failed to dup pcap FD: %v", err) 330 } 331 const packetTruncateSize = 4096 332 linkEP, err = sniffer.NewWithWriter(linkEP, os.NewFile(uintptr(newFD), "pcap-file"), packetTruncateSize) 333 if err != nil { 334 return fmt.Errorf("failed to create PCAP logger: %v", err) 335 } 336 fdOffset++ 337 } else if args.LogPackets { 338 linkEP = sniffer.New(linkEP) 339 } 340 341 var qDisc stack.QueueingDiscipline 342 switch link.QDisc { 343 case config.QDiscNone: 344 case config.QDiscFIFO: 345 log.Infof("Enabling FIFO QDisc on %q", link.Name) 346 qDisc = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) 347 } 348 349 log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) 350 opts := stack.NICOptions{ 351 Name: link.Name, 352 QDisc: qDisc, 353 GROTimeout: link.GvisorGROTimeout, 354 DeliverLinkPackets: true, 355 } 356 if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { 357 return err 358 } 359 360 // Collect the routes from this link. 361 for _, r := range link.Routes { 362 route, err := r.toTcpipRoute(nicID) 363 if err != nil { 364 return err 365 } 366 routes = append(routes, route) 367 } 368 369 for _, neigh := range link.Neighbors { 370 proto, tcpipAddr := ipToAddressAndProto(neigh.IP) 371 n.Stack.AddStaticNeighbor(nicID, proto, tcpipAddr, tcpip.LinkAddress(neigh.HardwareAddr)) 372 } 373 } 374 } else if len(args.XDPLinks) > 0 { 375 if nlinks := len(args.XDPLinks); nlinks > 1 { 376 return fmt.Errorf("XDP only supports one link device, but got %d", nlinks) 377 } 378 link := args.XDPLinks[0] 379 nicID++ 380 nicids[link.Name] = nicID 381 382 // Get the AF_XDP socket. 383 oldFD := args.FilePayload.Files[fdOffset].Fd() 384 fd, err := unix.Dup(int(oldFD)) 385 if err != nil { 386 return fmt.Errorf("failed to dup AF_XDP fd %v: %v", oldFD, err) 387 } 388 fdOffset++ 389 390 // When the sentry is responsible for binding, the runsc 391 // process sends several other FDs in order to keep them open 392 // and alive. These are for BPF programs and maps that, if 393 // closed, will break the dispatcher. 394 if link.Bind == BindSentry { 395 for _, fdName := range []string{"program-fd", "sockmap-fd", "link-fd"} { 396 oldFD := args.FilePayload.Files[fdOffset].Fd() 397 if _, err := unix.Dup(int(oldFD)); err != nil { 398 return fmt.Errorf("failed to dup %s with FD %d: %v", fdName, oldFD, err) 399 } 400 fdOffset++ 401 } 402 } 403 404 // Setup packet logging if requested. 405 mac := tcpip.LinkAddress(link.LinkAddress) 406 linkEP, err := xdp.New(&xdp.Options{ 407 FD: fd, 408 Address: mac, 409 TXChecksumOffload: link.TXChecksumOffload, 410 RXChecksumOffload: link.RXChecksumOffload, 411 InterfaceIndex: link.InterfaceIndex, 412 Bind: link.Bind == BindSentry, 413 }) 414 if err != nil { 415 return err 416 } 417 418 if args.PCAP { 419 newFD, err := unix.Dup(int(args.FilePayload.Files[fdOffset].Fd())) 420 if err != nil { 421 return fmt.Errorf("failed to dup pcap FD: %v", err) 422 } 423 const packetTruncateSize = 4096 424 linkEP, err = sniffer.NewWithWriter(linkEP, os.NewFile(uintptr(newFD), "pcap-file"), packetTruncateSize) 425 if err != nil { 426 return fmt.Errorf("failed to create PCAP logger: %v", err) 427 } 428 fdOffset++ 429 } else if args.LogPackets { 430 linkEP = sniffer.New(linkEP) 431 } 432 433 var qDisc stack.QueueingDiscipline 434 switch link.QDisc { 435 case config.QDiscNone: 436 case config.QDiscFIFO: 437 log.Infof("Enabling FIFO QDisc on %q", link.Name) 438 qDisc = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) 439 } 440 441 log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) 442 opts := stack.NICOptions{ 443 Name: link.Name, 444 QDisc: qDisc, 445 GROTimeout: link.GvisorGROTimeout, 446 DeliverLinkPackets: true, 447 } 448 if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { 449 return err 450 } 451 452 // Collect the routes from this link. 453 for _, r := range link.Routes { 454 route, err := r.toTcpipRoute(nicID) 455 if err != nil { 456 return err 457 } 458 routes = append(routes, route) 459 } 460 461 for _, neigh := range link.Neighbors { 462 proto, tcpipAddr := ipToAddressAndProto(neigh.IP) 463 n.Stack.AddStaticNeighbor(nicID, proto, tcpipAddr, tcpip.LinkAddress(neigh.HardwareAddr)) 464 } 465 } 466 467 if !args.Defaultv4Gateway.Route.Empty() { 468 nicID, ok := nicids[args.Defaultv4Gateway.Name] 469 if !ok { 470 return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name) 471 } 472 route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID) 473 if err != nil { 474 return err 475 } 476 routes = append(routes, route) 477 } 478 479 if !args.Defaultv6Gateway.Route.Empty() { 480 nicID, ok := nicids[args.Defaultv6Gateway.Name] 481 if !ok { 482 return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name) 483 } 484 route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID) 485 if err != nil { 486 return err 487 } 488 routes = append(routes, route) 489 } 490 491 log.Infof("Setting routes %+v", routes) 492 n.Stack.SetRouteTable(routes) 493 494 // Set NAT table rules if necessary. 495 if args.NATBlob { 496 log.Infof("Replacing NAT table") 497 iptReplaceBlob, err := io.ReadAll(args.FilePayload.Files[fdOffset]) 498 if err != nil { 499 return fmt.Errorf("failed to read iptables blob: %v", err) 500 } 501 fdOffset++ 502 if err := netfilter.SetEntries(n.Kernel.RootUserNamespace(), n.Stack, iptReplaceBlob, false); err != nil { 503 return fmt.Errorf("failed to SetEntries: %v", err) 504 } 505 } 506 507 return nil 508 } 509 510 // createNICWithAddrs creates a NIC in the network stack and adds the given 511 // addresses. 512 func (n *Network) createNICWithAddrs(id tcpip.NICID, ep stack.LinkEndpoint, opts stack.NICOptions, addrs []IPWithPrefix) error { 513 if err := n.Stack.CreateNICWithOptions(id, ep, opts); err != nil { 514 return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err) 515 } 516 517 for _, addr := range addrs { 518 proto, tcpipAddr := ipToAddressAndProto(addr.Address) 519 protocolAddr := tcpip.ProtocolAddress{ 520 Protocol: proto, 521 AddressWithPrefix: tcpip.AddressWithPrefix{ 522 Address: tcpipAddr, 523 PrefixLen: addr.PrefixLen, 524 }, 525 } 526 if err := n.Stack.AddProtocolAddress(id, protocolAddr, stack.AddressProperties{}); err != nil { 527 return fmt.Errorf("AddProtocolAddress(%d, %+v, {}) failed: %s", id, protocolAddr, err) 528 } 529 } 530 return nil 531 } 532 533 // ipToAddressAndProto converts IP to tcpip.Address and a protocol number. 534 // 535 // Note: don't use 'len(ip)' to determine IP version because length is always 16. 536 func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) { 537 if i4 := ip.To4(); i4 != nil { 538 return ipv4.ProtocolNumber, tcpip.AddrFromSlice(i4) 539 } 540 return ipv6.ProtocolNumber, tcpip.AddrFromSlice(ip) 541 } 542 543 // ipToAddress converts IP to tcpip.Address, ignoring the protocol. 544 func ipToAddress(ip net.IP) tcpip.Address { 545 _, addr := ipToAddressAndProto(ip) 546 return addr 547 } 548 549 // ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the 550 // protocol. 551 func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask { 552 addr := ipToAddress(net.IP(ipMask)) 553 return tcpip.MaskFromBytes(addr.AsSlice()) 554 }