github.com/kata-containers/runtime@v0.0.0-20210505125100-04f29832a923/virtcontainers/network.go (about) 1 // Copyright (c) 2016 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 6 package virtcontainers 7 8 import ( 9 "context" 10 cryptoRand "crypto/rand" 11 "encoding/json" 12 "fmt" 13 "math/rand" 14 "net" 15 "os" 16 "runtime" 17 "sort" 18 "time" 19 20 "github.com/containernetworking/plugins/pkg/ns" 21 "github.com/containernetworking/plugins/pkg/testutils" 22 opentracing "github.com/opentracing/opentracing-go" 23 "github.com/sirupsen/logrus" 24 "github.com/vishvananda/netlink" 25 "github.com/vishvananda/netns" 26 "golang.org/x/sys/unix" 27 28 "github.com/kata-containers/runtime/virtcontainers/pkg/rootless" 29 vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types" 30 "github.com/kata-containers/runtime/virtcontainers/pkg/uuid" 31 "github.com/kata-containers/runtime/virtcontainers/utils" 32 ) 33 34 // NetInterworkingModel defines the network model connecting 35 // the network interface to the virtual machine. 36 type NetInterworkingModel int 37 38 const ( 39 // NetXConnectDefaultModel Ask to use DefaultNetInterworkingModel 40 NetXConnectDefaultModel NetInterworkingModel = iota 41 42 // NetXConnectMacVtapModel can be used when the Container network 43 // interface can be bridged using macvtap 44 NetXConnectMacVtapModel 45 46 // NetXConnectTCFilterModel redirects traffic from the network interface 47 // provided by the network plugin to a tap interface. 48 // This works for ipvlan and macvlan as well. 
49 NetXConnectTCFilterModel 50 51 // NetXConnectNoneModel can be used when the VM is in the host network namespace 52 NetXConnectNoneModel 53 54 // NetXConnectInvalidModel is the last item to check valid values by IsValid() 55 NetXConnectInvalidModel 56 ) 57 58 //IsValid checks if a model is valid 59 func (n NetInterworkingModel) IsValid() bool { 60 return 0 <= int(n) && int(n) < int(NetXConnectInvalidModel) 61 } 62 63 const ( 64 defaultNetModelStr = "default" 65 66 macvtapNetModelStr = "macvtap" 67 68 tcFilterNetModelStr = "tcfilter" 69 70 noneNetModelStr = "none" 71 ) 72 73 //SetModel change the model string value 74 func (n *NetInterworkingModel) SetModel(modelName string) error { 75 switch modelName { 76 case defaultNetModelStr: 77 *n = DefaultNetInterworkingModel 78 return nil 79 case macvtapNetModelStr: 80 *n = NetXConnectMacVtapModel 81 return nil 82 case tcFilterNetModelStr: 83 *n = NetXConnectTCFilterModel 84 return nil 85 case noneNetModelStr: 86 *n = NetXConnectNoneModel 87 return nil 88 } 89 return fmt.Errorf("Unknown type %s", modelName) 90 } 91 92 // DefaultNetInterworkingModel is a package level default 93 // that determines how the VM should be connected to the 94 // the container network interface 95 var DefaultNetInterworkingModel = NetXConnectTCFilterModel 96 97 // Introduces constants related to networking 98 const ( 99 defaultFilePerms = 0600 100 defaultQlen = 1500 101 ) 102 103 // DNSInfo describes the DNS setup related to a network interface. 104 type DNSInfo struct { 105 Servers []string 106 Domain string 107 Searches []string 108 Options []string 109 } 110 111 // NetlinkIface describes fully a network interface. 112 type NetlinkIface struct { 113 netlink.LinkAttrs 114 Type string 115 } 116 117 // NetworkInfo gathers all information related to a network interface. 118 // It can be used to store the description of the underlying network. 
119 type NetworkInfo struct { 120 Iface NetlinkIface 121 Addrs []netlink.Addr 122 Routes []netlink.Route 123 DNS DNSInfo 124 Neighbors []netlink.Neigh 125 } 126 127 // NetworkInterface defines a network interface. 128 type NetworkInterface struct { 129 Name string 130 HardAddr string 131 Addrs []netlink.Addr 132 } 133 134 // TapInterface defines a tap interface 135 type TapInterface struct { 136 ID string 137 Name string 138 TAPIface NetworkInterface 139 VMFds []*os.File 140 VhostFds []*os.File 141 } 142 143 // TuntapInterface defines a tap interface 144 type TuntapInterface struct { 145 Name string 146 TAPIface NetworkInterface 147 } 148 149 // NetworkInterfacePair defines a pair between VM and virtual network interfaces. 150 type NetworkInterfacePair struct { 151 TapInterface 152 VirtIface NetworkInterface 153 NetInterworkingModel 154 } 155 156 // NetworkConfig is the network configuration related to a network. 157 type NetworkConfig struct { 158 NetNSPath string 159 NetNsCreated bool 160 DisableNewNetNs bool 161 NetmonConfig NetmonConfig 162 InterworkingModel NetInterworkingModel 163 } 164 165 func networkLogger() *logrus.Entry { 166 return virtLog.WithField("subsystem", "network") 167 } 168 169 // NetworkNamespace contains all data related to its network namespace. 170 type NetworkNamespace struct { 171 NetNsPath string 172 NetNsCreated bool 173 Endpoints []Endpoint 174 NetmonPID int 175 } 176 177 // TypedJSONEndpoint is used as an intermediate representation for 178 // marshalling and unmarshalling Endpoint objects. 179 type TypedJSONEndpoint struct { 180 Type EndpointType 181 Data json.RawMessage 182 } 183 184 // MarshalJSON is the custom NetworkNamespace JSON marshalling routine. 185 // This is needed to properly marshall Endpoints array. 186 func (n NetworkNamespace) MarshalJSON() ([]byte, error) { 187 // We need a shadow structure in order to prevent json from 188 // entering a recursive loop when only calling json.Marshal(). 
189 type shadow struct { 190 NetNsPath string 191 NetNsCreated bool 192 Endpoints []TypedJSONEndpoint 193 } 194 195 s := &shadow{ 196 NetNsPath: n.NetNsPath, 197 NetNsCreated: n.NetNsCreated, 198 } 199 200 var typedEndpoints []TypedJSONEndpoint 201 for _, endpoint := range n.Endpoints { 202 tempJSON, _ := json.Marshal(endpoint) 203 204 t := TypedJSONEndpoint{ 205 Type: endpoint.Type(), 206 Data: tempJSON, 207 } 208 209 typedEndpoints = append(typedEndpoints, t) 210 } 211 212 s.Endpoints = typedEndpoints 213 214 b, err := json.Marshal(s) 215 return b, err 216 } 217 218 func generateEndpoints(typedEndpoints []TypedJSONEndpoint) ([]Endpoint, error) { 219 var endpoints []Endpoint 220 221 for _, e := range typedEndpoints { 222 var endpointInf Endpoint 223 switch e.Type { 224 case PhysicalEndpointType: 225 var endpoint PhysicalEndpoint 226 endpointInf = &endpoint 227 228 case VethEndpointType: 229 var endpoint VethEndpoint 230 endpointInf = &endpoint 231 232 case VhostUserEndpointType: 233 var endpoint VhostUserEndpoint 234 endpointInf = &endpoint 235 236 case BridgedMacvlanEndpointType: 237 var endpoint BridgedMacvlanEndpoint 238 endpointInf = &endpoint 239 240 case MacvtapEndpointType: 241 var endpoint MacvtapEndpoint 242 endpointInf = &endpoint 243 244 case TapEndpointType: 245 var endpoint TapEndpoint 246 endpointInf = &endpoint 247 248 case IPVlanEndpointType: 249 var endpoint IPVlanEndpoint 250 endpointInf = &endpoint 251 252 case TuntapEndpointType: 253 var endpoint TuntapEndpoint 254 endpointInf = &endpoint 255 256 default: 257 networkLogger().WithField("endpoint-type", e.Type).Error("Ignoring unknown endpoint type") 258 } 259 260 err := json.Unmarshal(e.Data, endpointInf) 261 if err != nil { 262 return nil, err 263 } 264 265 endpoints = append(endpoints, endpointInf) 266 networkLogger().WithFields(logrus.Fields{ 267 "endpoint": endpointInf, 268 "endpoint-type": e.Type, 269 }).Info("endpoint unmarshalled") 270 } 271 return endpoints, nil 272 } 273 274 // 
UnmarshalJSON is the custom NetworkNamespace unmarshalling routine. 275 // This is needed for unmarshalling the Endpoints interfaces array. 276 func (n *NetworkNamespace) UnmarshalJSON(b []byte) error { 277 var s struct { 278 NetNsPath string 279 NetNsCreated bool 280 Endpoints json.RawMessage 281 } 282 283 if err := json.Unmarshal(b, &s); err != nil { 284 return err 285 } 286 287 (*n).NetNsPath = s.NetNsPath 288 (*n).NetNsCreated = s.NetNsCreated 289 290 var typedEndpoints []TypedJSONEndpoint 291 if err := json.Unmarshal([]byte(string(s.Endpoints)), &typedEndpoints); err != nil { 292 return err 293 } 294 endpoints, err := generateEndpoints(typedEndpoints) 295 if err != nil { 296 return err 297 } 298 299 (*n).Endpoints = endpoints 300 return nil 301 } 302 303 func createLink(netHandle *netlink.Handle, name string, expectedLink netlink.Link, queues int) (netlink.Link, []*os.File, error) { 304 var newLink netlink.Link 305 var fds []*os.File 306 307 switch expectedLink.Type() { 308 case (&netlink.Tuntap{}).Type(): 309 flags := netlink.TUNTAP_VNET_HDR 310 if queues > 0 { 311 flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS 312 } 313 newLink = &netlink.Tuntap{ 314 LinkAttrs: netlink.LinkAttrs{Name: name}, 315 Mode: netlink.TUNTAP_MODE_TAP, 316 Queues: queues, 317 Flags: flags, 318 } 319 case (&netlink.Macvtap{}).Type(): 320 qlen := expectedLink.Attrs().TxQLen 321 if qlen <= 0 { 322 qlen = defaultQlen 323 } 324 newLink = &netlink.Macvtap{ 325 Macvlan: netlink.Macvlan{ 326 Mode: netlink.MACVLAN_MODE_BRIDGE, 327 LinkAttrs: netlink.LinkAttrs{ 328 Index: expectedLink.Attrs().Index, 329 Name: name, 330 TxQLen: qlen, 331 ParentIndex: expectedLink.Attrs().ParentIndex, 332 }, 333 }, 334 } 335 default: 336 return nil, fds, fmt.Errorf("Unsupported link type %s", expectedLink.Type()) 337 } 338 339 if err := netHandle.LinkAdd(newLink); err != nil { 340 return nil, fds, fmt.Errorf("LinkAdd() failed for %s name %s: %s", expectedLink.Type(), name, err) 341 } 342 343 tuntapLink, ok := 
newLink.(*netlink.Tuntap) 344 if ok { 345 fds = tuntapLink.Fds 346 } 347 348 newLink, err := getLinkByName(netHandle, name, expectedLink) 349 return newLink, fds, err 350 } 351 352 func getLinkForEndpoint(endpoint Endpoint, netHandle *netlink.Handle) (netlink.Link, error) { 353 var link netlink.Link 354 355 switch ep := endpoint.(type) { 356 case *VethEndpoint: 357 link = &netlink.Veth{} 358 case *BridgedMacvlanEndpoint: 359 link = &netlink.Macvlan{} 360 case *IPVlanEndpoint: 361 link = &netlink.IPVlan{} 362 case *TuntapEndpoint: 363 link = &netlink.Tuntap{} 364 default: 365 return nil, fmt.Errorf("Unexpected endpointType %s", ep.Type()) 366 } 367 368 return getLinkByName(netHandle, endpoint.NetworkPair().VirtIface.Name, link) 369 } 370 371 func getLinkByName(netHandle *netlink.Handle, name string, expectedLink netlink.Link) (netlink.Link, error) { 372 link, err := netHandle.LinkByName(name) 373 if err != nil { 374 return nil, fmt.Errorf("LinkByName() failed for %s name %s: %s", expectedLink.Type(), name, err) 375 } 376 377 switch expectedLink.Type() { 378 case (&netlink.Tuntap{}).Type(): 379 if l, ok := link.(*netlink.Tuntap); ok { 380 return l, nil 381 } 382 case (&netlink.Veth{}).Type(): 383 if l, ok := link.(*netlink.Veth); ok { 384 return l, nil 385 } 386 case (&netlink.Macvtap{}).Type(): 387 if l, ok := link.(*netlink.Macvtap); ok { 388 return l, nil 389 } 390 case (&netlink.Macvlan{}).Type(): 391 if l, ok := link.(*netlink.Macvlan); ok { 392 return l, nil 393 } 394 case (&netlink.IPVlan{}).Type(): 395 if l, ok := link.(*netlink.IPVlan); ok { 396 return l, nil 397 } 398 default: 399 return nil, fmt.Errorf("Unsupported link type %s", expectedLink.Type()) 400 } 401 402 return nil, fmt.Errorf("Incorrect link type %s, expecting %s", link.Type(), expectedLink.Type()) 403 } 404 405 // The endpoint type should dictate how the connection needs to happen. 
406 func xConnectVMNetwork(endpoint Endpoint, h hypervisor) error { 407 netPair := endpoint.NetworkPair() 408 409 queues := 0 410 caps := h.capabilities() 411 if caps.IsMultiQueueSupported() { 412 queues = int(h.hypervisorConfig().NumVCPUs) 413 } 414 415 var disableVhostNet bool 416 if rootless.IsRootless() { 417 disableVhostNet = true 418 } else { 419 disableVhostNet = h.hypervisorConfig().DisableVhostNet 420 } 421 422 if netPair.NetInterworkingModel == NetXConnectDefaultModel { 423 netPair.NetInterworkingModel = DefaultNetInterworkingModel 424 } 425 426 switch netPair.NetInterworkingModel { 427 case NetXConnectMacVtapModel: 428 return tapNetworkPair(endpoint, queues, disableVhostNet) 429 case NetXConnectTCFilterModel: 430 return setupTCFiltering(endpoint, queues, disableVhostNet) 431 default: 432 return fmt.Errorf("Invalid internetworking model") 433 } 434 } 435 436 // The endpoint type should dictate how the disconnection needs to happen. 437 func xDisconnectVMNetwork(endpoint Endpoint) error { 438 netPair := endpoint.NetworkPair() 439 440 if netPair.NetInterworkingModel == NetXConnectDefaultModel { 441 netPair.NetInterworkingModel = DefaultNetInterworkingModel 442 } 443 444 switch netPair.NetInterworkingModel { 445 case NetXConnectMacVtapModel: 446 return untapNetworkPair(endpoint) 447 case NetXConnectTCFilterModel: 448 return removeTCFiltering(endpoint) 449 default: 450 return fmt.Errorf("Invalid internetworking model") 451 } 452 } 453 454 func createMacvtapFds(linkIndex int, queues int) ([]*os.File, error) { 455 tapDev := fmt.Sprintf("/dev/tap%d", linkIndex) 456 return createFds(tapDev, queues) 457 } 458 459 func createVhostFds(numFds int) ([]*os.File, error) { 460 vhostDev := "/dev/vhost-net" 461 return createFds(vhostDev, numFds) 462 } 463 464 func createFds(device string, numFds int) ([]*os.File, error) { 465 fds := make([]*os.File, numFds) 466 467 for i := 0; i < numFds; i++ { 468 f, err := os.OpenFile(device, os.O_RDWR, defaultFilePerms) 469 if err != 
nil { 470 utils.CleanupFds(fds, i) 471 return nil, err 472 } 473 fds[i] = f 474 } 475 return fds, nil 476 } 477 478 // There is a limitation in the linux kernel that prevents a macvtap/macvlan link 479 // from getting the correct link index when created in a network namespace 480 // https://github.com/clearcontainers/runtime/issues/708 481 // 482 // Till that bug is fixed we need to pick a random non conflicting index and try to 483 // create a link. If that fails, we need to try with another. 484 // All the kernel does not check if the link id conflicts with a link id on the host 485 // hence we need to offset the link id to prevent any overlaps with the host index 486 // 487 // Here the kernel will ensure that there is no race condition 488 489 const hostLinkOffset = 8192 // Host should not have more than 8k interfaces 490 const linkRange = 0xFFFF // This will allow upto 2^16 containers 491 const linkRetries = 128 // The numbers of time we try to find a non conflicting index 492 const macvtapWorkaround = true 493 494 func createMacVtap(netHandle *netlink.Handle, name string, link netlink.Link, queues int) (taplink netlink.Link, err error) { 495 496 if !macvtapWorkaround { 497 taplink, _, err = createLink(netHandle, name, link, queues) 498 return 499 } 500 501 r := rand.New(rand.NewSource(time.Now().UnixNano())) 502 503 for i := 0; i < linkRetries; i++ { 504 index := hostLinkOffset + (r.Int() & linkRange) 505 link.Attrs().Index = index 506 taplink, _, err = createLink(netHandle, name, link, queues) 507 if err == nil { 508 break 509 } 510 } 511 512 return 513 } 514 515 func clearIPs(link netlink.Link, addrs []netlink.Addr) error { 516 for _, addr := range addrs { 517 if err := netlink.AddrDel(link, &addr); err != nil { 518 return err 519 } 520 } 521 return nil 522 } 523 524 func setIPs(link netlink.Link, addrs []netlink.Addr) error { 525 for _, addr := range addrs { 526 if err := netlink.AddrAdd(link, &addr); err != nil { 527 return err 528 } 529 } 530 return nil 
531 } 532 533 func tapNetworkPair(endpoint Endpoint, queues int, disableVhostNet bool) error { 534 netHandle, err := netlink.NewHandle() 535 if err != nil { 536 return err 537 } 538 defer netHandle.Delete() 539 540 netPair := endpoint.NetworkPair() 541 542 link, err := getLinkForEndpoint(endpoint, netHandle) 543 if err != nil { 544 return err 545 } 546 547 attrs := link.Attrs() 548 549 // Attach the macvtap interface to the underlying container 550 // interface. Also picks relevant attributes from the parent 551 tapLink, err := createMacVtap(netHandle, netPair.TAPIface.Name, 552 &netlink.Macvtap{ 553 Macvlan: netlink.Macvlan{ 554 LinkAttrs: netlink.LinkAttrs{ 555 TxQLen: attrs.TxQLen, 556 ParentIndex: attrs.Index, 557 }, 558 }, 559 }, queues) 560 561 if err != nil { 562 return fmt.Errorf("Could not create TAP interface: %s", err) 563 } 564 565 // Save the veth MAC address to the TAP so that it can later be used 566 // to build the hypervisor command line. This MAC address has to be 567 // the one inside the VM in order to avoid any firewall issues. The 568 // bridge created by the network plugin on the host actually expects 569 // to see traffic from this MAC address and not another one. 
570 tapHardAddr := attrs.HardwareAddr 571 netPair.TAPIface.HardAddr = attrs.HardwareAddr.String() 572 573 if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil { 574 return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err) 575 } 576 577 hardAddr, err := net.ParseMAC(netPair.VirtIface.HardAddr) 578 if err != nil { 579 return err 580 } 581 if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil { 582 return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", 583 netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) 584 } 585 586 if err := netHandle.LinkSetHardwareAddr(tapLink, tapHardAddr); err != nil { 587 return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", 588 netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) 589 } 590 591 if err := netHandle.LinkSetUp(tapLink); err != nil { 592 return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err) 593 } 594 595 // Clear the IP addresses from the veth interface to prevent ARP conflict 596 netPair.VirtIface.Addrs, err = netlink.AddrList(link, netlink.FAMILY_ALL) 597 if err != nil { 598 return fmt.Errorf("Unable to obtain veth IP addresses: %s", err) 599 } 600 601 if err := clearIPs(link, netPair.VirtIface.Addrs); err != nil { 602 return fmt.Errorf("Unable to clear veth IP addresses: %s", err) 603 } 604 605 if err := netHandle.LinkSetUp(link); err != nil { 606 return fmt.Errorf("Could not enable veth %s: %s", netPair.VirtIface.Name, err) 607 } 608 609 // Note: The underlying interfaces need to be up prior to fd creation. 
610 611 netPair.VMFds, err = createMacvtapFds(tapLink.Attrs().Index, queues) 612 if err != nil { 613 return fmt.Errorf("Could not setup macvtap fds %s: %s", netPair.TAPIface, err) 614 } 615 616 if !disableVhostNet { 617 vhostFds, err := createVhostFds(queues) 618 if err != nil { 619 return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err) 620 } 621 netPair.VhostFds = vhostFds 622 } 623 624 return nil 625 } 626 627 func setupTCFiltering(endpoint Endpoint, queues int, disableVhostNet bool) error { 628 netHandle, err := netlink.NewHandle() 629 if err != nil { 630 return err 631 } 632 defer netHandle.Delete() 633 634 netPair := endpoint.NetworkPair() 635 636 tapLink, fds, err := createLink(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}, queues) 637 if err != nil { 638 return fmt.Errorf("Could not create TAP interface: %s", err) 639 } 640 netPair.VMFds = fds 641 642 if !disableVhostNet { 643 vhostFds, err := createVhostFds(queues) 644 if err != nil { 645 return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err) 646 } 647 netPair.VhostFds = vhostFds 648 } 649 650 var attrs *netlink.LinkAttrs 651 var link netlink.Link 652 653 link, err = getLinkForEndpoint(endpoint, netHandle) 654 if err != nil { 655 return err 656 } 657 658 attrs = link.Attrs() 659 660 // Save the veth MAC address to the TAP so that it can later be used 661 // to build the hypervisor command line. This MAC address has to be 662 // the one inside the VM in order to avoid any firewall issues. The 663 // bridge created by the network plugin on the host actually expects 664 // to see traffic from this MAC address and not another one. 
665 netPair.TAPIface.HardAddr = attrs.HardwareAddr.String() 666 667 if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil { 668 return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err) 669 } 670 671 if err := netHandle.LinkSetUp(tapLink); err != nil { 672 return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err) 673 } 674 675 tapAttrs := tapLink.Attrs() 676 677 if err := addQdiscIngress(tapAttrs.Index); err != nil { 678 return err 679 } 680 681 if err := addQdiscIngress(attrs.Index); err != nil { 682 return err 683 } 684 685 if err := addRedirectTCFilter(attrs.Index, tapAttrs.Index); err != nil { 686 return err 687 } 688 689 if err := addRedirectTCFilter(tapAttrs.Index, attrs.Index); err != nil { 690 return err 691 } 692 693 return nil 694 } 695 696 // addQdiscIngress creates a new qdisc for nwtwork interface with the specified network index 697 // on "ingress". qdiscs normally don't work on ingress so this is really a special qdisc 698 // that you can consider an "alternate root" for inbound packets. 699 // Handle for ingress qdisc defaults to "ffff:" 700 // 701 // This is equivalent to calling `tc qdisc add dev eth0 ingress` 702 func addQdiscIngress(index int) error { 703 qdisc := &netlink.Ingress{ 704 QdiscAttrs: netlink.QdiscAttrs{ 705 LinkIndex: index, 706 Parent: netlink.HANDLE_INGRESS, 707 }, 708 } 709 710 err := netlink.QdiscAdd(qdisc) 711 if err != nil { 712 return fmt.Errorf("Failed to add qdisc for network index %d : %s", index, err) 713 } 714 715 return nil 716 } 717 718 // addRedirectTCFilter adds a tc filter for device with index "sourceIndex". 
719 // All traffic for interface with index "sourceIndex" is redirected to interface with 720 // index "destIndex" 721 // 722 // This is equivalent to calling: 723 // `tc filter add dev source parent ffff: protocol all u32 match u8 0 0 action mirred egress redirect dev dest` 724 func addRedirectTCFilter(sourceIndex, destIndex int) error { 725 filter := &netlink.U32{ 726 FilterAttrs: netlink.FilterAttrs{ 727 LinkIndex: sourceIndex, 728 Parent: netlink.MakeHandle(0xffff, 0), 729 Protocol: unix.ETH_P_ALL, 730 }, 731 Actions: []netlink.Action{ 732 &netlink.MirredAction{ 733 ActionAttrs: netlink.ActionAttrs{ 734 Action: netlink.TC_ACT_STOLEN, 735 }, 736 MirredAction: netlink.TCA_EGRESS_REDIR, 737 Ifindex: destIndex, 738 }, 739 }, 740 } 741 742 if err := netlink.FilterAdd(filter); err != nil { 743 return fmt.Errorf("Failed to add filter for index %d : %s", sourceIndex, err) 744 } 745 746 return nil 747 } 748 749 // removeRedirectTCFilter removes all tc u32 filters created on ingress qdisc for "link". 750 func removeRedirectTCFilter(link netlink.Link) error { 751 if link == nil { 752 return nil 753 } 754 755 // Handle 0xffff is used for ingress 756 filters, err := netlink.FilterList(link, netlink.MakeHandle(0xffff, 0)) 757 if err != nil { 758 return err 759 } 760 761 for _, f := range filters { 762 u32, ok := f.(*netlink.U32) 763 764 if !ok { 765 continue 766 } 767 768 if err := netlink.FilterDel(u32); err != nil { 769 return err 770 } 771 } 772 return nil 773 } 774 775 // removeQdiscIngress removes the ingress qdisc previously created on "link". 
776 func removeQdiscIngress(link netlink.Link) error { 777 if link == nil { 778 return nil 779 } 780 781 qdiscs, err := netlink.QdiscList(link) 782 if err != nil { 783 return err 784 } 785 786 for _, qdisc := range qdiscs { 787 ingress, ok := qdisc.(*netlink.Ingress) 788 if !ok { 789 continue 790 } 791 792 if err := netlink.QdiscDel(ingress); err != nil { 793 return err 794 } 795 } 796 return nil 797 } 798 799 func untapNetworkPair(endpoint Endpoint) error { 800 netHandle, err := netlink.NewHandle() 801 if err != nil { 802 return err 803 } 804 defer netHandle.Delete() 805 806 netPair := endpoint.NetworkPair() 807 808 tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Macvtap{}) 809 if err != nil { 810 return fmt.Errorf("Could not get TAP interface %s: %s", netPair.TAPIface.Name, err) 811 } 812 813 if err := netHandle.LinkDel(tapLink); err != nil { 814 return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err) 815 } 816 817 link, err := getLinkForEndpoint(endpoint, netHandle) 818 if err != nil { 819 return err 820 } 821 822 hardAddr, err := net.ParseMAC(netPair.TAPIface.HardAddr) 823 if err != nil { 824 return err 825 } 826 if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil { 827 return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", 828 netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) 829 } 830 831 if err := netHandle.LinkSetDown(link); err != nil { 832 return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err) 833 } 834 835 // Restore the IPs that were cleared 836 err = setIPs(link, netPair.VirtIface.Addrs) 837 return err 838 } 839 840 func removeTCFiltering(endpoint Endpoint) error { 841 netHandle, err := netlink.NewHandle() 842 if err != nil { 843 return err 844 } 845 defer netHandle.Delete() 846 847 netPair := endpoint.NetworkPair() 848 849 tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}) 850 if err != nil { 851 return 
fmt.Errorf("Could not get TAP interface: %s", err) 852 } 853 854 if err := netHandle.LinkSetDown(tapLink); err != nil { 855 return fmt.Errorf("Could not disable TAP %s: %s", netPair.TAPIface.Name, err) 856 } 857 858 if err := netHandle.LinkDel(tapLink); err != nil { 859 return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err) 860 } 861 862 link, err := getLinkForEndpoint(endpoint, netHandle) 863 if err != nil { 864 return err 865 } 866 867 if err := removeRedirectTCFilter(link); err != nil { 868 return err 869 } 870 871 if err := removeQdiscIngress(link); err != nil { 872 return err 873 } 874 875 if err := netHandle.LinkSetDown(link); err != nil { 876 return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err) 877 } 878 879 return nil 880 } 881 882 func createNetNS() (string, error) { 883 n, err := testutils.NewNS() 884 if err != nil { 885 return "", err 886 } 887 888 return n.Path(), nil 889 } 890 891 // doNetNS is free from any call to a go routine, and it calls 892 // into runtime.LockOSThread(), meaning it won't be executed in a 893 // different thread than the one expected by the caller. 894 func doNetNS(netNSPath string, cb func(ns.NetNS) error) error { 895 // if netNSPath is empty, the callback function will be run in the current network namespace. 896 // So skip the whole function, just call cb(). cb() needs a NetNS as arg but ignored, give it a fake one. 
897 if netNSPath == "" { 898 var netNs ns.NetNS 899 return cb(netNs) 900 } 901 902 runtime.LockOSThread() 903 defer runtime.UnlockOSThread() 904 905 currentNS, err := ns.GetCurrentNS() 906 if err != nil { 907 return err 908 } 909 defer currentNS.Close() 910 911 targetNS, err := ns.GetNS(netNSPath) 912 if err != nil { 913 return err 914 } 915 916 if err := targetNS.Set(); err != nil { 917 return err 918 } 919 defer currentNS.Set() 920 921 return cb(targetNS) 922 } 923 924 func deleteNetNS(netNSPath string) error { 925 n, err := ns.GetNS(netNSPath) 926 if err != nil { 927 return err 928 } 929 930 err = n.Close() 931 if err != nil { 932 return err 933 } 934 935 if err = unix.Unmount(netNSPath, unix.MNT_DETACH); err != nil { 936 return fmt.Errorf("Failed to unmount namespace %s: %v", netNSPath, err) 937 } 938 if err := os.RemoveAll(netNSPath); err != nil { 939 return fmt.Errorf("Failed to clean up namespace %s: %v", netNSPath, err) 940 } 941 942 return nil 943 } 944 945 func generateVCNetworkStructures(networkNS NetworkNamespace) ([]*vcTypes.Interface, []*vcTypes.Route, []*vcTypes.ARPNeighbor, error) { 946 947 if networkNS.NetNsPath == "" { 948 return nil, nil, nil, nil 949 } 950 951 var routes []*vcTypes.Route 952 var ifaces []*vcTypes.Interface 953 var neighs []*vcTypes.ARPNeighbor 954 955 for _, endpoint := range networkNS.Endpoints { 956 957 var ipAddresses []*vcTypes.IPAddress 958 for _, addr := range endpoint.Properties().Addrs { 959 // Skip localhost interface 960 if addr.IP.IsLoopback() { 961 continue 962 } 963 964 netMask, _ := addr.Mask.Size() 965 ipAddress := vcTypes.IPAddress{ 966 Family: netlink.FAMILY_V4, 967 Address: addr.IP.String(), 968 Mask: fmt.Sprintf("%d", netMask), 969 } 970 971 if addr.IP.To4() == nil { 972 ipAddress.Family = netlink.FAMILY_V6 973 } 974 ipAddresses = append(ipAddresses, &ipAddress) 975 } 976 noarp := endpoint.Properties().Iface.RawFlags & unix.IFF_NOARP 977 ifc := vcTypes.Interface{ 978 IPAddresses: ipAddresses, 979 Device: 
endpoint.Name(),
			Name:     endpoint.Name(),
			Mtu:      uint64(endpoint.Properties().Iface.MTU),
			RawFlags: noarp,
			HwAddr:   endpoint.HardwareAddr(),
			PciPath:  endpoint.PciPath(),
		}

		ifaces = append(ifaces, &ifc)

		for _, route := range endpoint.Properties().Routes {
			var r vcTypes.Route

			if route.Protocol == unix.RTPROT_KERNEL {
				continue
			}

			if route.Dst != nil {
				r.Dest = route.Dst.String()
			}

			if route.Gw != nil {
				gateway := route.Gw.String()
				r.Gateway = gateway
			}

			if route.Src != nil {
				r.Source = route.Src.String()
			}

			r.Device = endpoint.Name()
			r.Scope = uint32(route.Scope)
			routes = append(routes, &r)
		}

		for _, neigh := range endpoint.Properties().Neighbors {
			var n vcTypes.ARPNeighbor

			// We add only static ARP entries
			if neigh.State != netlink.NUD_PERMANENT {
				continue
			}

			n.Device = endpoint.Name()
			n.State = neigh.State
			n.Flags = neigh.Flags

			if neigh.HardwareAddr != nil {
				n.LLAddr = neigh.HardwareAddr.String()
			}

			n.ToIPAddress = &vcTypes.IPAddress{
				Family:  netlink.FAMILY_V4,
				Address: neigh.IP.String(),
			}
			if neigh.IP.To4() == nil {
				n.ToIPAddress.Family = netlink.FAMILY_V6
			}

			neighs = append(neighs, &n)
		}
	}
	return ifaces, routes, neighs, nil
}

// createNetworkInterfacePair builds the NetworkInterfacePair describing the
// tap side and the virtual side of an interface connection.
//
// idx is used to derive the default names ("br<idx>_kata", "tap<idx>_kata"
// and "eth<idx>"). When ifName is not empty it replaces the default name of
// the virtual interface. The virtual interface is given a freshly generated
// random, locally administered MAC address and the pair gets a new UUID as
// its ID. Nothing is created on the system here; only the description is
// filled in.
func createNetworkInterfacePair(idx int, ifName string, interworkingModel NetInterworkingModel) (NetworkInterfacePair, error) {
	uniqueID := uuid.Generate().String()

	randomMacAddr, err := generateRandomPrivateMacAddr()
	if err != nil {
		return NetworkInterfacePair{}, fmt.Errorf("Could not generate random mac address: %s", err)
	}

	netPair := NetworkInterfacePair{
		TapInterface: TapInterface{
			ID:   uniqueID,
			Name: fmt.Sprintf("br%d_kata", idx),
			TAPIface: NetworkInterface{
				Name: fmt.Sprintf("tap%d_kata", idx),
			},
		},
		VirtIface: NetworkInterface{
			Name:     fmt.Sprintf("eth%d", idx),
			HardAddr: randomMacAddr,
		},
		NetInterworkingModel: interworkingModel,
	}

	// An explicit interface name takes precedence over the generated one.
	if ifName != "" {
		netPair.VirtIface.Name = ifName
	}

	return netPair, nil
}

// generateRandomPrivateMacAddr returns a random unicast, locally administered
// MAC address in its textual form (as produced by net.HardwareAddr.String).
// The bytes come from crypto/rand; the read error, if any, is returned as is.
func generateRandomPrivateMacAddr() (string, error) {
	buf := make([]byte, 6)
	if _, err := cryptoRand.Read(buf); err != nil {
		return "", err
	}

	// Set the local bit for local addresses
	// Addresses in this range are local mac addresses:
	// x2-xx-xx-xx-xx-xx , x6-xx-xx-xx-xx-xx , xA-xx-xx-xx-xx-xx , xE-xx-xx-xx-xx-xx
	// The lowest bit of the first byte is cleared as well so that the
	// address is never a multicast/group address.
	buf[0] = (buf[0] | 2) & 0xfe

	hardAddr := net.HardwareAddr(buf)
	return hardAddr.String(), nil
}

// networkInfoFromLink collects, through handle, the addresses, routes and
// neighbor entries (all address families) attached to link and returns them
// together with the link attributes and type. The first netlink query that
// fails aborts the collection and its error is returned.
func networkInfoFromLink(handle *netlink.Handle, link netlink.Link) (NetworkInfo, error) {
	addrs, err := handle.AddrList(link, netlink.FAMILY_ALL)
	if err != nil {
		return NetworkInfo{}, err
	}

	routes, err := handle.RouteList(link, netlink.FAMILY_ALL)
	if err != nil {
		return NetworkInfo{}, err
	}

	neighbors, err := handle.NeighList(link.Attrs().Index, netlink.FAMILY_ALL)
	if err != nil {
		return NetworkInfo{}, err
	}

	return NetworkInfo{
		Iface: NetlinkIface{
			LinkAttrs: *(link.Attrs()),
			Type:      link.Type(),
		},
		Addrs:     addrs,
		Routes:    routes,
		Neighbors: neighbors,
	}, nil
}

// createEndpointsFromScan lists the links found in the network namespace at
// networkNSPath and creates one Endpoint per usable link.
//
// Links without any address and loopback links are skipped. Endpoints are
// created from inside the namespace, numbered in discovery order, and the
// resulting slice is sorted by endpoint name. On any failure an empty slice
// and the error are returned.
func createEndpointsFromScan(networkNSPath string, config *NetworkConfig) ([]Endpoint, error) {
	var endpoints []Endpoint

	netnsHandle, err := netns.GetFromPath(networkNSPath)
	if err != nil {
		return []Endpoint{}, err
	}
	defer netnsHandle.Close()

	netlinkHandle, err := netlink.NewHandleAt(netnsHandle)
	if err != nil {
		return []Endpoint{}, err
	}
	defer netlinkHandle.Delete()

	linkList, err := netlinkHandle.LinkList()
	if err != nil {
		return []Endpoint{}, err
	}

	idx := 0
	for _, link := range linkList {
		var endpoint Endpoint

		netInfo, err := networkInfoFromLink(netlinkHandle, link)
		if err != nil {
			return []Endpoint{}, err
		}

		// Ignore unconfigured network interfaces. These are
		// either base tunnel devices that are not namespaced
		// like gre0, gretap0, sit0, ipip0, tunl0 or incorrectly
		// setup interfaces.
		if len(netInfo.Addrs) == 0 {
			continue
		}

		// Skip any loopback interfaces:
		if (netInfo.Iface.Flags & net.FlagLoopback) != 0 {
			continue
		}

		// The endpoint has to be created from within the namespace.
		if err := doNetNS(networkNSPath, func(_ ns.NetNS) error {
			var errCreate error
			endpoint, errCreate = createEndpoint(netInfo, idx, config.InterworkingModel, link)
			return errCreate
		}); err != nil {
			return []Endpoint{}, err
		}

		endpoint.SetProperties(netInfo)
		endpoints = append(endpoints, endpoint)

		// Only links that produced an endpoint consume an index.
		idx++
	}

	sort.Slice(endpoints, func(i, j int) bool {
		return endpoints[i].Name() < endpoints[j].Name()
	})

	networkLogger().WithField("endpoints", endpoints).Info("Endpoints found after scan")

	return endpoints, nil
}

// createEndpoint creates the Endpoint matching the interface described by
// netInfo.
//
// A physical interface gets a physical endpoint. Otherwise an interface with
// an associated vhost-user socket gets a vhost-user endpoint, and the
// remaining interfaces are dispatched on their link type (macvlan, macvtap,
// tap, tuntap, veth, ipvlan). idx and model are forwarded to the endpoint
// constructors that need them.
//
// link is only consulted for "tuntap" interfaces, where it must be a
// *netlink.Tuntap so that the device mode can be read. A nil or differently
// typed link yields an error for such interfaces.
//
// On error the returned Endpoint is always nil.
func createEndpoint(netInfo NetworkInfo, idx int, model NetInterworkingModel, link netlink.Link) (Endpoint, error) {
	var endpoint Endpoint
	// TODO: This is the incoming interface
	// based on the incoming interface we should create
	// an appropriate EndPoint based on interface type

	ifaceName := netInfo.Iface.Name

	// Check if interface is a physical interface. Do not create
	// tap interface/bridge if it is.
	isPhysical, err := isPhysicalIface(ifaceName)
	if err != nil {
		return nil, err
	}

	if isPhysical {
		networkLogger().WithField("interface", ifaceName).Info("Physical network interface found")
		endpoint, err = createPhysicalEndpoint(netInfo)
		if err != nil {
			return nil, err
		}
		return endpoint, nil
	}

	// Check if this is a dummy interface which has a vhost-user socket associated with it
	socketPath, err := vhostUserSocketPath(netInfo)
	if err != nil {
		return nil, err
	}

	switch {
	case socketPath != "":
		networkLogger().WithField("interface", ifaceName).Info("VhostUser network interface found")
		endpoint, err = createVhostUserEndpoint(netInfo, socketPath)
	case netInfo.Iface.Type == "macvlan":
		networkLogger().Infof("macvlan interface found")
		endpoint, err = createBridgedMacvlanNetworkEndpoint(idx, ifaceName, model)
	case netInfo.Iface.Type == "macvtap":
		networkLogger().Infof("macvtap interface found")
		endpoint, err = createMacvtapNetworkEndpoint(netInfo)
	case netInfo.Iface.Type == "tap":
		networkLogger().Info("tap interface found")
		endpoint, err = createTapNetworkEndpoint(idx, ifaceName)
	case netInfo.Iface.Type == "tuntap":
		// The device mode lives on the netlink link. Without it no
		// endpoint can be chosen, so fail instead of returning a nil
		// endpoint with a nil error (which callers would dereference).
		tuntap, ok := link.(*netlink.Tuntap)
		if !ok {
			return nil, fmt.Errorf("tuntap interface %q: no netlink tuntap link available to determine device mode", ifaceName)
		}

		switch tuntap.Mode {
		case 0:
			// mount /sys/class/net to get links
			return nil, fmt.Errorf("Network device mode not determined correctly. Mount sysfs in caller")
		case 1:
			return nil, fmt.Errorf("tun networking device not yet supported")
		case 2:
			networkLogger().Info("tuntap tap interface found")
			endpoint, err = createTuntapNetworkEndpoint(idx, ifaceName, netInfo.Iface.HardwareAddr, model)
		default:
			return nil, fmt.Errorf("tuntap network %v mode unsupported", tuntap.Mode)
		}
	case netInfo.Iface.Type == "veth":
		endpoint, err = createVethNetworkEndpoint(idx, ifaceName, model)
	case netInfo.Iface.Type == "ipvlan":
		endpoint, err = createIPVlanNetworkEndpoint(idx, ifaceName)
	default:
		return nil, fmt.Errorf("Unsupported network interface: %s", netInfo.Iface.Type)
	}

	// Never hand back a possibly half-built endpoint together with an error.
	if err != nil {
		return nil, err
	}

	return endpoint, nil
}

// Network is the virtcontainer network structure
type Network struct {
}

// trace starts a span called name from ctx, tags it with the "network"
// subsystem and the "default" type, and returns the span along with the
// context carrying it. The caller is responsible for finishing the span.
func (n *Network) trace(ctx context.Context, name string) (opentracing.Span, context.Context) {
	span, ct := opentracing.StartSpanFromContext(ctx, name)

	span.SetTag("subsystem", "network")
	span.SetTag("type", "default")

	return span, ct
}

// Run runs a callback in the specified network namespace.
// The error returned by cb, or by entering the namespace, is returned as is.
func (n *Network) Run(networkNSPath string, cb func() error) error {
	span, _ := n.trace(context.Background(), "run")
	defer span.Finish()

	return doNetNS(networkNSPath, func(_ ns.NetNS) error {
		return cb()
	})
}

// Add adds all needed interfaces inside the network namespace.
// It scans config.NetNSPath for endpoints and attaches each of them to the
// sandbox s, hot-attaching them to the sandbox hypervisor when hotplug is
// true. The endpoints found are returned once all of them are attached.
1273 func (n *Network) Add(ctx context.Context, config *NetworkConfig, s *Sandbox, hotplug bool) ([]Endpoint, error) { 1274 span, _ := n.trace(ctx, "add") 1275 defer span.Finish() 1276 1277 endpoints, err := createEndpointsFromScan(config.NetNSPath, config) 1278 if err != nil { 1279 return endpoints, err 1280 } 1281 1282 err = doNetNS(config.NetNSPath, func(_ ns.NetNS) error { 1283 for _, endpoint := range endpoints { 1284 networkLogger().WithField("endpoint-type", endpoint.Type()).WithField("hotplug", hotplug).Info("Attaching endpoint") 1285 if hotplug { 1286 if err := endpoint.HotAttach(s.hypervisor); err != nil { 1287 return err 1288 } 1289 } else { 1290 if err := endpoint.Attach(s); err != nil { 1291 return err 1292 } 1293 } 1294 } 1295 1296 return nil 1297 }) 1298 if err != nil { 1299 return []Endpoint{}, err 1300 } 1301 1302 networkLogger().Debug("Network added") 1303 1304 return endpoints, nil 1305 } 1306 1307 func (n *Network) PostAdd(ctx context.Context, ns *NetworkNamespace, hotplug bool) error { 1308 if hotplug { 1309 return nil 1310 } 1311 1312 if ns.Endpoints == nil { 1313 return nil 1314 } 1315 1316 endpoints := ns.Endpoints 1317 1318 for _, endpoint := range endpoints { 1319 netPair := endpoint.NetworkPair() 1320 if netPair == nil { 1321 continue 1322 } 1323 if netPair.VhostFds != nil { 1324 for _, VhostFd := range netPair.VhostFds { 1325 VhostFd.Close() 1326 } 1327 } 1328 } 1329 1330 return nil 1331 } 1332 1333 // Remove network endpoints in the network namespace. It also deletes the network 1334 // namespace in case the namespace has been created by us. 1335 func (n *Network) Remove(ctx context.Context, ns *NetworkNamespace, hypervisor hypervisor) error { 1336 span, _ := n.trace(ctx, "remove") 1337 defer span.Finish() 1338 1339 for _, endpoint := range ns.Endpoints { 1340 // Detach for an endpoint should enter the network namespace 1341 // if required. 
1342 networkLogger().WithField("endpoint-type", endpoint.Type()).Info("Detaching endpoint") 1343 if err := endpoint.Detach(ns.NetNsCreated, ns.NetNsPath); err != nil { 1344 return err 1345 } 1346 } 1347 1348 networkLogger().Debug("Network removed") 1349 1350 if ns.NetNsCreated { 1351 networkLogger().Infof("Network namespace %q deleted", ns.NetNsPath) 1352 return deleteNetNS(ns.NetNsPath) 1353 } 1354 1355 return nil 1356 }