gitee.com/leisunstar/runtime@v0.0.0-20200521203717-5cef3e7b53f9/virtcontainers/network.go (about) 1 // Copyright (c) 2016 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 6 package virtcontainers 7 8 import ( 9 "context" 10 cryptoRand "crypto/rand" 11 "encoding/json" 12 "fmt" 13 "math/rand" 14 "net" 15 "os" 16 "runtime" 17 "sort" 18 "time" 19 20 "github.com/containernetworking/plugins/pkg/ns" 21 "github.com/containernetworking/plugins/pkg/testutils" 22 opentracing "github.com/opentracing/opentracing-go" 23 "github.com/sirupsen/logrus" 24 "github.com/vishvananda/netlink" 25 "github.com/vishvananda/netns" 26 "golang.org/x/sys/unix" 27 28 "github.com/kata-containers/runtime/virtcontainers/pkg/rootless" 29 vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types" 30 "github.com/kata-containers/runtime/virtcontainers/pkg/uuid" 31 "github.com/kata-containers/runtime/virtcontainers/utils" 32 ) 33 34 // NetInterworkingModel defines the network model connecting 35 // the network interface to the virtual machine. 36 type NetInterworkingModel int 37 38 const ( 39 // NetXConnectDefaultModel Ask to use DefaultNetInterworkingModel 40 NetXConnectDefaultModel NetInterworkingModel = iota 41 42 // NetXConnectMacVtapModel can be used when the Container network 43 // interface can be bridged using macvtap 44 NetXConnectMacVtapModel 45 46 // NetXConnectTCFilterModel redirects traffic from the network interface 47 // provided by the network plugin to a tap interface. 48 // This works for ipvlan and macvlan as well. 
49 NetXConnectTCFilterModel 50 51 // NetXConnectNoneModel can be used when the VM is in the host network namespace 52 NetXConnectNoneModel 53 54 // NetXConnectInvalidModel is the last item to check valid values by IsValid() 55 NetXConnectInvalidModel 56 ) 57 58 //IsValid checks if a model is valid 59 func (n NetInterworkingModel) IsValid() bool { 60 return 0 <= int(n) && int(n) < int(NetXConnectInvalidModel) 61 } 62 63 const ( 64 defaultNetModelStr = "default" 65 66 macvtapNetModelStr = "macvtap" 67 68 tcFilterNetModelStr = "tcfilter" 69 70 noneNetModelStr = "none" 71 ) 72 73 //SetModel change the model string value 74 func (n *NetInterworkingModel) SetModel(modelName string) error { 75 switch modelName { 76 case defaultNetModelStr: 77 *n = DefaultNetInterworkingModel 78 return nil 79 case macvtapNetModelStr: 80 *n = NetXConnectMacVtapModel 81 return nil 82 case tcFilterNetModelStr: 83 *n = NetXConnectTCFilterModel 84 return nil 85 case noneNetModelStr: 86 *n = NetXConnectNoneModel 87 return nil 88 } 89 return fmt.Errorf("Unknown type %s", modelName) 90 } 91 92 // DefaultNetInterworkingModel is a package level default 93 // that determines how the VM should be connected to the 94 // the container network interface 95 var DefaultNetInterworkingModel = NetXConnectTCFilterModel 96 97 // Introduces constants related to networking 98 const ( 99 defaultFilePerms = 0600 100 defaultQlen = 1500 101 ) 102 103 // DNSInfo describes the DNS setup related to a network interface. 104 type DNSInfo struct { 105 Servers []string 106 Domain string 107 Searches []string 108 Options []string 109 } 110 111 // NetlinkIface describes fully a network interface. 112 type NetlinkIface struct { 113 netlink.LinkAttrs 114 Type string 115 } 116 117 // NetworkInfo gathers all information related to a network interface. 118 // It can be used to store the description of the underlying network. 
119 type NetworkInfo struct { 120 Iface NetlinkIface 121 Addrs []netlink.Addr 122 Routes []netlink.Route 123 DNS DNSInfo 124 } 125 126 // NetworkInterface defines a network interface. 127 type NetworkInterface struct { 128 Name string 129 HardAddr string 130 Addrs []netlink.Addr 131 } 132 133 // TapInterface defines a tap interface 134 type TapInterface struct { 135 ID string 136 Name string 137 TAPIface NetworkInterface 138 VMFds []*os.File 139 VhostFds []*os.File 140 } 141 142 // TuntapInterface defines a tap interface 143 type TuntapInterface struct { 144 Name string 145 TAPIface NetworkInterface 146 } 147 148 // NetworkInterfacePair defines a pair between VM and virtual network interfaces. 149 type NetworkInterfacePair struct { 150 TapInterface 151 VirtIface NetworkInterface 152 NetInterworkingModel 153 } 154 155 // NetworkConfig is the network configuration related to a network. 156 type NetworkConfig struct { 157 NetNSPath string 158 NetNsCreated bool 159 DisableNewNetNs bool 160 NetmonConfig NetmonConfig 161 InterworkingModel NetInterworkingModel 162 } 163 164 func networkLogger() *logrus.Entry { 165 return virtLog.WithField("subsystem", "network") 166 } 167 168 // NetworkNamespace contains all data related to its network namespace. 169 type NetworkNamespace struct { 170 NetNsPath string 171 NetNsCreated bool 172 Endpoints []Endpoint 173 NetmonPID int 174 } 175 176 // TypedJSONEndpoint is used as an intermediate representation for 177 // marshalling and unmarshalling Endpoint objects. 178 type TypedJSONEndpoint struct { 179 Type EndpointType 180 Data json.RawMessage 181 } 182 183 // MarshalJSON is the custom NetworkNamespace JSON marshalling routine. 184 // This is needed to properly marshall Endpoints array. 185 func (n NetworkNamespace) MarshalJSON() ([]byte, error) { 186 // We need a shadow structure in order to prevent json from 187 // entering a recursive loop when only calling json.Marshal(). 
188 type shadow struct { 189 NetNsPath string 190 NetNsCreated bool 191 Endpoints []TypedJSONEndpoint 192 } 193 194 s := &shadow{ 195 NetNsPath: n.NetNsPath, 196 NetNsCreated: n.NetNsCreated, 197 } 198 199 var typedEndpoints []TypedJSONEndpoint 200 for _, endpoint := range n.Endpoints { 201 tempJSON, _ := json.Marshal(endpoint) 202 203 t := TypedJSONEndpoint{ 204 Type: endpoint.Type(), 205 Data: tempJSON, 206 } 207 208 typedEndpoints = append(typedEndpoints, t) 209 } 210 211 s.Endpoints = typedEndpoints 212 213 b, err := json.Marshal(s) 214 return b, err 215 } 216 217 func generateEndpoints(typedEndpoints []TypedJSONEndpoint) ([]Endpoint, error) { 218 var endpoints []Endpoint 219 220 for _, e := range typedEndpoints { 221 var endpointInf Endpoint 222 switch e.Type { 223 case PhysicalEndpointType: 224 var endpoint PhysicalEndpoint 225 endpointInf = &endpoint 226 227 case VethEndpointType: 228 var endpoint VethEndpoint 229 endpointInf = &endpoint 230 231 case VhostUserEndpointType: 232 var endpoint VhostUserEndpoint 233 endpointInf = &endpoint 234 235 case BridgedMacvlanEndpointType: 236 var endpoint BridgedMacvlanEndpoint 237 endpointInf = &endpoint 238 239 case MacvtapEndpointType: 240 var endpoint MacvtapEndpoint 241 endpointInf = &endpoint 242 243 case TapEndpointType: 244 var endpoint TapEndpoint 245 endpointInf = &endpoint 246 247 case IPVlanEndpointType: 248 var endpoint IPVlanEndpoint 249 endpointInf = &endpoint 250 251 case TuntapEndpointType: 252 var endpoint TuntapEndpoint 253 endpointInf = &endpoint 254 255 default: 256 networkLogger().WithField("endpoint-type", e.Type).Error("Ignoring unknown endpoint type") 257 } 258 259 err := json.Unmarshal(e.Data, endpointInf) 260 if err != nil { 261 return nil, err 262 } 263 264 endpoints = append(endpoints, endpointInf) 265 networkLogger().WithFields(logrus.Fields{ 266 "endpoint": endpointInf, 267 "endpoint-type": e.Type, 268 }).Info("endpoint unmarshalled") 269 } 270 return endpoints, nil 271 } 272 273 // 
UnmarshalJSON is the custom NetworkNamespace unmarshalling routine. 274 // This is needed for unmarshalling the Endpoints interfaces array. 275 func (n *NetworkNamespace) UnmarshalJSON(b []byte) error { 276 var s struct { 277 NetNsPath string 278 NetNsCreated bool 279 Endpoints json.RawMessage 280 } 281 282 if err := json.Unmarshal(b, &s); err != nil { 283 return err 284 } 285 286 (*n).NetNsPath = s.NetNsPath 287 (*n).NetNsCreated = s.NetNsCreated 288 289 var typedEndpoints []TypedJSONEndpoint 290 if err := json.Unmarshal([]byte(string(s.Endpoints)), &typedEndpoints); err != nil { 291 return err 292 } 293 endpoints, err := generateEndpoints(typedEndpoints) 294 if err != nil { 295 return err 296 } 297 298 (*n).Endpoints = endpoints 299 return nil 300 } 301 302 func createLink(netHandle *netlink.Handle, name string, expectedLink netlink.Link, queues int) (netlink.Link, []*os.File, error) { 303 var newLink netlink.Link 304 var fds []*os.File 305 306 switch expectedLink.Type() { 307 case (&netlink.Tuntap{}).Type(): 308 flags := netlink.TUNTAP_VNET_HDR 309 if queues > 0 { 310 flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS 311 } 312 newLink = &netlink.Tuntap{ 313 LinkAttrs: netlink.LinkAttrs{Name: name}, 314 Mode: netlink.TUNTAP_MODE_TAP, 315 Queues: queues, 316 Flags: flags, 317 } 318 case (&netlink.Macvtap{}).Type(): 319 qlen := expectedLink.Attrs().TxQLen 320 if qlen <= 0 { 321 qlen = defaultQlen 322 } 323 newLink = &netlink.Macvtap{ 324 Macvlan: netlink.Macvlan{ 325 Mode: netlink.MACVLAN_MODE_BRIDGE, 326 LinkAttrs: netlink.LinkAttrs{ 327 Index: expectedLink.Attrs().Index, 328 Name: name, 329 TxQLen: qlen, 330 ParentIndex: expectedLink.Attrs().ParentIndex, 331 }, 332 }, 333 } 334 default: 335 return nil, fds, fmt.Errorf("Unsupported link type %s", expectedLink.Type()) 336 } 337 338 if err := netHandle.LinkAdd(newLink); err != nil { 339 return nil, fds, fmt.Errorf("LinkAdd() failed for %s name %s: %s", expectedLink.Type(), name, err) 340 } 341 342 tuntapLink, ok := 
newLink.(*netlink.Tuntap) 343 if ok { 344 fds = tuntapLink.Fds 345 } 346 347 newLink, err := getLinkByName(netHandle, name, expectedLink) 348 return newLink, fds, err 349 } 350 351 func getLinkForEndpoint(endpoint Endpoint, netHandle *netlink.Handle) (netlink.Link, error) { 352 var link netlink.Link 353 354 switch ep := endpoint.(type) { 355 case *VethEndpoint: 356 link = &netlink.Veth{} 357 case *BridgedMacvlanEndpoint: 358 link = &netlink.Macvlan{} 359 case *IPVlanEndpoint: 360 link = &netlink.IPVlan{} 361 case *TuntapEndpoint: 362 link = &netlink.Tuntap{} 363 default: 364 return nil, fmt.Errorf("Unexpected endpointType %s", ep.Type()) 365 } 366 367 return getLinkByName(netHandle, endpoint.NetworkPair().VirtIface.Name, link) 368 } 369 370 func getLinkByName(netHandle *netlink.Handle, name string, expectedLink netlink.Link) (netlink.Link, error) { 371 link, err := netHandle.LinkByName(name) 372 if err != nil { 373 return nil, fmt.Errorf("LinkByName() failed for %s name %s: %s", expectedLink.Type(), name, err) 374 } 375 376 switch expectedLink.Type() { 377 case (&netlink.Tuntap{}).Type(): 378 if l, ok := link.(*netlink.Tuntap); ok { 379 return l, nil 380 } 381 case (&netlink.Veth{}).Type(): 382 if l, ok := link.(*netlink.Veth); ok { 383 return l, nil 384 } 385 case (&netlink.Macvtap{}).Type(): 386 if l, ok := link.(*netlink.Macvtap); ok { 387 return l, nil 388 } 389 case (&netlink.Macvlan{}).Type(): 390 if l, ok := link.(*netlink.Macvlan); ok { 391 return l, nil 392 } 393 case (&netlink.IPVlan{}).Type(): 394 if l, ok := link.(*netlink.IPVlan); ok { 395 return l, nil 396 } 397 default: 398 return nil, fmt.Errorf("Unsupported link type %s", expectedLink.Type()) 399 } 400 401 return nil, fmt.Errorf("Incorrect link type %s, expecting %s", link.Type(), expectedLink.Type()) 402 } 403 404 // The endpoint type should dictate how the connection needs to happen. 
405 func xConnectVMNetwork(endpoint Endpoint, h hypervisor) error { 406 netPair := endpoint.NetworkPair() 407 408 queues := 0 409 caps := h.capabilities() 410 if caps.IsMultiQueueSupported() { 411 queues = int(h.hypervisorConfig().NumVCPUs) 412 } 413 414 var disableVhostNet bool 415 if rootless.IsRootless() { 416 disableVhostNet = true 417 } else { 418 disableVhostNet = h.hypervisorConfig().DisableVhostNet 419 } 420 421 if netPair.NetInterworkingModel == NetXConnectDefaultModel { 422 netPair.NetInterworkingModel = DefaultNetInterworkingModel 423 } 424 425 switch netPair.NetInterworkingModel { 426 case NetXConnectMacVtapModel: 427 return tapNetworkPair(endpoint, queues, disableVhostNet) 428 case NetXConnectTCFilterModel: 429 return setupTCFiltering(endpoint, queues, disableVhostNet) 430 default: 431 return fmt.Errorf("Invalid internetworking model") 432 } 433 } 434 435 // The endpoint type should dictate how the disconnection needs to happen. 436 func xDisconnectVMNetwork(endpoint Endpoint) error { 437 netPair := endpoint.NetworkPair() 438 439 if netPair.NetInterworkingModel == NetXConnectDefaultModel { 440 netPair.NetInterworkingModel = DefaultNetInterworkingModel 441 } 442 443 switch netPair.NetInterworkingModel { 444 case NetXConnectMacVtapModel: 445 return untapNetworkPair(endpoint) 446 case NetXConnectTCFilterModel: 447 return removeTCFiltering(endpoint) 448 default: 449 return fmt.Errorf("Invalid internetworking model") 450 } 451 } 452 453 func createMacvtapFds(linkIndex int, queues int) ([]*os.File, error) { 454 tapDev := fmt.Sprintf("/dev/tap%d", linkIndex) 455 return createFds(tapDev, queues) 456 } 457 458 func createVhostFds(numFds int) ([]*os.File, error) { 459 vhostDev := "/dev/vhost-net" 460 return createFds(vhostDev, numFds) 461 } 462 463 func createFds(device string, numFds int) ([]*os.File, error) { 464 fds := make([]*os.File, numFds) 465 466 for i := 0; i < numFds; i++ { 467 f, err := os.OpenFile(device, os.O_RDWR, defaultFilePerms) 468 if err != 
nil { 469 utils.CleanupFds(fds, i) 470 return nil, err 471 } 472 fds[i] = f 473 } 474 return fds, nil 475 } 476 477 // There is a limitation in the linux kernel that prevents a macvtap/macvlan link 478 // from getting the correct link index when created in a network namespace 479 // https://github.com/clearcontainers/runtime/issues/708 480 // 481 // Till that bug is fixed we need to pick a random non conflicting index and try to 482 // create a link. If that fails, we need to try with another. 483 // All the kernel does not check if the link id conflicts with a link id on the host 484 // hence we need to offset the link id to prevent any overlaps with the host index 485 // 486 // Here the kernel will ensure that there is no race condition 487 488 const hostLinkOffset = 8192 // Host should not have more than 8k interfaces 489 const linkRange = 0xFFFF // This will allow upto 2^16 containers 490 const linkRetries = 128 // The numbers of time we try to find a non conflicting index 491 const macvtapWorkaround = true 492 493 func createMacVtap(netHandle *netlink.Handle, name string, link netlink.Link, queues int) (taplink netlink.Link, err error) { 494 495 if !macvtapWorkaround { 496 taplink, _, err = createLink(netHandle, name, link, queues) 497 return 498 } 499 500 r := rand.New(rand.NewSource(time.Now().UnixNano())) 501 502 for i := 0; i < linkRetries; i++ { 503 index := hostLinkOffset + (r.Int() & linkRange) 504 link.Attrs().Index = index 505 taplink, _, err = createLink(netHandle, name, link, queues) 506 if err == nil { 507 break 508 } 509 } 510 511 return 512 } 513 514 func clearIPs(link netlink.Link, addrs []netlink.Addr) error { 515 for _, addr := range addrs { 516 if err := netlink.AddrDel(link, &addr); err != nil { 517 return err 518 } 519 } 520 return nil 521 } 522 523 func setIPs(link netlink.Link, addrs []netlink.Addr) error { 524 for _, addr := range addrs { 525 if err := netlink.AddrAdd(link, &addr); err != nil { 526 return err 527 } 528 } 529 return nil 
530 } 531 532 func tapNetworkPair(endpoint Endpoint, queues int, disableVhostNet bool) error { 533 netHandle, err := netlink.NewHandle() 534 if err != nil { 535 return err 536 } 537 defer netHandle.Delete() 538 539 netPair := endpoint.NetworkPair() 540 541 link, err := getLinkForEndpoint(endpoint, netHandle) 542 if err != nil { 543 return err 544 } 545 546 attrs := link.Attrs() 547 548 // Attach the macvtap interface to the underlying container 549 // interface. Also picks relevant attributes from the parent 550 tapLink, err := createMacVtap(netHandle, netPair.TAPIface.Name, 551 &netlink.Macvtap{ 552 Macvlan: netlink.Macvlan{ 553 LinkAttrs: netlink.LinkAttrs{ 554 TxQLen: attrs.TxQLen, 555 ParentIndex: attrs.Index, 556 }, 557 }, 558 }, queues) 559 560 if err != nil { 561 return fmt.Errorf("Could not create TAP interface: %s", err) 562 } 563 564 // Save the veth MAC address to the TAP so that it can later be used 565 // to build the hypervisor command line. This MAC address has to be 566 // the one inside the VM in order to avoid any firewall issues. The 567 // bridge created by the network plugin on the host actually expects 568 // to see traffic from this MAC address and not another one. 
569 tapHardAddr := attrs.HardwareAddr 570 netPair.TAPIface.HardAddr = attrs.HardwareAddr.String() 571 572 if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil { 573 return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err) 574 } 575 576 hardAddr, err := net.ParseMAC(netPair.VirtIface.HardAddr) 577 if err != nil { 578 return err 579 } 580 if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil { 581 return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", 582 netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) 583 } 584 585 if err := netHandle.LinkSetHardwareAddr(tapLink, tapHardAddr); err != nil { 586 return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", 587 netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) 588 } 589 590 if err := netHandle.LinkSetUp(tapLink); err != nil { 591 return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err) 592 } 593 594 // Clear the IP addresses from the veth interface to prevent ARP conflict 595 netPair.VirtIface.Addrs, err = netlink.AddrList(link, netlink.FAMILY_ALL) 596 if err != nil { 597 return fmt.Errorf("Unable to obtain veth IP addresses: %s", err) 598 } 599 600 if err := clearIPs(link, netPair.VirtIface.Addrs); err != nil { 601 return fmt.Errorf("Unable to clear veth IP addresses: %s", err) 602 } 603 604 if err := netHandle.LinkSetUp(link); err != nil { 605 return fmt.Errorf("Could not enable veth %s: %s", netPair.VirtIface.Name, err) 606 } 607 608 // Note: The underlying interfaces need to be up prior to fd creation. 
609 610 netPair.VMFds, err = createMacvtapFds(tapLink.Attrs().Index, queues) 611 if err != nil { 612 return fmt.Errorf("Could not setup macvtap fds %s: %s", netPair.TAPIface, err) 613 } 614 615 if !disableVhostNet { 616 vhostFds, err := createVhostFds(queues) 617 if err != nil { 618 return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err) 619 } 620 netPair.VhostFds = vhostFds 621 } 622 623 return nil 624 } 625 626 func setupTCFiltering(endpoint Endpoint, queues int, disableVhostNet bool) error { 627 netHandle, err := netlink.NewHandle() 628 if err != nil { 629 return err 630 } 631 defer netHandle.Delete() 632 633 netPair := endpoint.NetworkPair() 634 635 tapLink, fds, err := createLink(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}, queues) 636 if err != nil { 637 return fmt.Errorf("Could not create TAP interface: %s", err) 638 } 639 netPair.VMFds = fds 640 641 if !disableVhostNet { 642 vhostFds, err := createVhostFds(queues) 643 if err != nil { 644 return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err) 645 } 646 netPair.VhostFds = vhostFds 647 } 648 649 var attrs *netlink.LinkAttrs 650 var link netlink.Link 651 652 link, err = getLinkForEndpoint(endpoint, netHandle) 653 if err != nil { 654 return err 655 } 656 657 attrs = link.Attrs() 658 659 // Save the veth MAC address to the TAP so that it can later be used 660 // to build the hypervisor command line. This MAC address has to be 661 // the one inside the VM in order to avoid any firewall issues. The 662 // bridge created by the network plugin on the host actually expects 663 // to see traffic from this MAC address and not another one. 
664 netPair.TAPIface.HardAddr = attrs.HardwareAddr.String() 665 666 if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil { 667 return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err) 668 } 669 670 if err := netHandle.LinkSetUp(tapLink); err != nil { 671 return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err) 672 } 673 674 tapAttrs := tapLink.Attrs() 675 676 if err := addQdiscIngress(tapAttrs.Index); err != nil { 677 return err 678 } 679 680 if err := addQdiscIngress(attrs.Index); err != nil { 681 return err 682 } 683 684 if err := addRedirectTCFilter(attrs.Index, tapAttrs.Index); err != nil { 685 return err 686 } 687 688 if err := addRedirectTCFilter(tapAttrs.Index, attrs.Index); err != nil { 689 return err 690 } 691 692 return nil 693 } 694 695 // addQdiscIngress creates a new qdisc for nwtwork interface with the specified network index 696 // on "ingress". qdiscs normally don't work on ingress so this is really a special qdisc 697 // that you can consider an "alternate root" for inbound packets. 698 // Handle for ingress qdisc defaults to "ffff:" 699 // 700 // This is equivalent to calling `tc qdisc add dev eth0 ingress` 701 func addQdiscIngress(index int) error { 702 qdisc := &netlink.Ingress{ 703 QdiscAttrs: netlink.QdiscAttrs{ 704 LinkIndex: index, 705 Parent: netlink.HANDLE_INGRESS, 706 }, 707 } 708 709 err := netlink.QdiscAdd(qdisc) 710 if err != nil { 711 return fmt.Errorf("Failed to add qdisc for network index %d : %s", index, err) 712 } 713 714 return nil 715 } 716 717 // addRedirectTCFilter adds a tc filter for device with index "sourceIndex". 
718 // All traffic for interface with index "sourceIndex" is redirected to interface with 719 // index "destIndex" 720 // 721 // This is equivalent to calling: 722 // `tc filter add dev source parent ffff: protocol all u32 match u8 0 0 action mirred egress redirect dev dest` 723 func addRedirectTCFilter(sourceIndex, destIndex int) error { 724 filter := &netlink.U32{ 725 FilterAttrs: netlink.FilterAttrs{ 726 LinkIndex: sourceIndex, 727 Parent: netlink.MakeHandle(0xffff, 0), 728 Protocol: unix.ETH_P_ALL, 729 }, 730 Actions: []netlink.Action{ 731 &netlink.MirredAction{ 732 ActionAttrs: netlink.ActionAttrs{ 733 Action: netlink.TC_ACT_STOLEN, 734 }, 735 MirredAction: netlink.TCA_EGRESS_REDIR, 736 Ifindex: destIndex, 737 }, 738 }, 739 } 740 741 if err := netlink.FilterAdd(filter); err != nil { 742 return fmt.Errorf("Failed to add filter for index %d : %s", sourceIndex, err) 743 } 744 745 return nil 746 } 747 748 // removeRedirectTCFilter removes all tc u32 filters created on ingress qdisc for "link". 749 func removeRedirectTCFilter(link netlink.Link) error { 750 if link == nil { 751 return nil 752 } 753 754 // Handle 0xffff is used for ingress 755 filters, err := netlink.FilterList(link, netlink.MakeHandle(0xffff, 0)) 756 if err != nil { 757 return err 758 } 759 760 for _, f := range filters { 761 u32, ok := f.(*netlink.U32) 762 763 if !ok { 764 continue 765 } 766 767 if err := netlink.FilterDel(u32); err != nil { 768 return err 769 } 770 } 771 return nil 772 } 773 774 // removeQdiscIngress removes the ingress qdisc previously created on "link". 
775 func removeQdiscIngress(link netlink.Link) error { 776 if link == nil { 777 return nil 778 } 779 780 qdiscs, err := netlink.QdiscList(link) 781 if err != nil { 782 return err 783 } 784 785 for _, qdisc := range qdiscs { 786 ingress, ok := qdisc.(*netlink.Ingress) 787 if !ok { 788 continue 789 } 790 791 if err := netlink.QdiscDel(ingress); err != nil { 792 return err 793 } 794 } 795 return nil 796 } 797 798 func untapNetworkPair(endpoint Endpoint) error { 799 netHandle, err := netlink.NewHandle() 800 if err != nil { 801 return err 802 } 803 defer netHandle.Delete() 804 805 netPair := endpoint.NetworkPair() 806 807 tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Macvtap{}) 808 if err != nil { 809 return fmt.Errorf("Could not get TAP interface %s: %s", netPair.TAPIface.Name, err) 810 } 811 812 if err := netHandle.LinkDel(tapLink); err != nil { 813 return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err) 814 } 815 816 link, err := getLinkForEndpoint(endpoint, netHandle) 817 if err != nil { 818 return err 819 } 820 821 hardAddr, err := net.ParseMAC(netPair.TAPIface.HardAddr) 822 if err != nil { 823 return err 824 } 825 if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil { 826 return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", 827 netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) 828 } 829 830 if err := netHandle.LinkSetDown(link); err != nil { 831 return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err) 832 } 833 834 // Restore the IPs that were cleared 835 err = setIPs(link, netPair.VirtIface.Addrs) 836 return err 837 } 838 839 func removeTCFiltering(endpoint Endpoint) error { 840 netHandle, err := netlink.NewHandle() 841 if err != nil { 842 return err 843 } 844 defer netHandle.Delete() 845 846 netPair := endpoint.NetworkPair() 847 848 tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}) 849 if err != nil { 850 return 
fmt.Errorf("Could not get TAP interface: %s", err) 851 } 852 853 if err := netHandle.LinkSetDown(tapLink); err != nil { 854 return fmt.Errorf("Could not disable TAP %s: %s", netPair.TAPIface.Name, err) 855 } 856 857 if err := netHandle.LinkDel(tapLink); err != nil { 858 return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err) 859 } 860 861 link, err := getLinkForEndpoint(endpoint, netHandle) 862 if err != nil { 863 return err 864 } 865 866 if err := removeRedirectTCFilter(link); err != nil { 867 return err 868 } 869 870 if err := removeQdiscIngress(link); err != nil { 871 return err 872 } 873 874 if err := netHandle.LinkSetDown(link); err != nil { 875 return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err) 876 } 877 878 return nil 879 } 880 881 func createNetNS() (string, error) { 882 n, err := testutils.NewNS() 883 if err != nil { 884 return "", err 885 } 886 887 return n.Path(), nil 888 } 889 890 // doNetNS is free from any call to a go routine, and it calls 891 // into runtime.LockOSThread(), meaning it won't be executed in a 892 // different thread than the one expected by the caller. 893 func doNetNS(netNSPath string, cb func(ns.NetNS) error) error { 894 // if netNSPath is empty, the callback function will be run in the current network namespace. 895 // So skip the whole function, just call cb(). cb() needs a NetNS as arg but ignored, give it a fake one. 
896 if netNSPath == "" { 897 var netNs ns.NetNS 898 return cb(netNs) 899 } 900 901 runtime.LockOSThread() 902 defer runtime.UnlockOSThread() 903 904 currentNS, err := ns.GetCurrentNS() 905 if err != nil { 906 return err 907 } 908 defer currentNS.Close() 909 910 targetNS, err := ns.GetNS(netNSPath) 911 if err != nil { 912 return err 913 } 914 915 if err := targetNS.Set(); err != nil { 916 return err 917 } 918 defer currentNS.Set() 919 920 return cb(targetNS) 921 } 922 923 func deleteNetNS(netNSPath string) error { 924 n, err := ns.GetNS(netNSPath) 925 if err != nil { 926 return err 927 } 928 929 err = n.Close() 930 if err != nil { 931 return err 932 } 933 934 if err = unix.Unmount(netNSPath, unix.MNT_DETACH); err != nil { 935 return fmt.Errorf("Failed to unmount namespace %s: %v", netNSPath, err) 936 } 937 if err := os.RemoveAll(netNSPath); err != nil { 938 return fmt.Errorf("Failed to clean up namespace %s: %v", netNSPath, err) 939 } 940 941 return nil 942 } 943 944 func generateInterfacesAndRoutes(networkNS NetworkNamespace) ([]*vcTypes.Interface, []*vcTypes.Route, error) { 945 946 if networkNS.NetNsPath == "" { 947 return nil, nil, nil 948 } 949 950 var routes []*vcTypes.Route 951 var ifaces []*vcTypes.Interface 952 953 for _, endpoint := range networkNS.Endpoints { 954 955 var ipAddresses []*vcTypes.IPAddress 956 for _, addr := range endpoint.Properties().Addrs { 957 // Skip localhost interface 958 if addr.IP.IsLoopback() { 959 continue 960 } 961 962 netMask, _ := addr.Mask.Size() 963 ipAddress := vcTypes.IPAddress{ 964 Family: netlink.FAMILY_V4, 965 Address: addr.IP.String(), 966 Mask: fmt.Sprintf("%d", netMask), 967 } 968 969 if addr.IP.To4() == nil { 970 ipAddress.Family = netlink.FAMILY_V6 971 } 972 ipAddresses = append(ipAddresses, &ipAddress) 973 } 974 noarp := endpoint.Properties().Iface.RawFlags & unix.IFF_NOARP 975 ifc := vcTypes.Interface{ 976 IPAddresses: ipAddresses, 977 Device: endpoint.Name(), 978 Name: endpoint.Name(), 979 Mtu: 
uint64(endpoint.Properties().Iface.MTU), 980 RawFlags: noarp, 981 HwAddr: endpoint.HardwareAddr(), 982 PciAddr: endpoint.PciAddr(), 983 } 984 985 ifaces = append(ifaces, &ifc) 986 987 for _, route := range endpoint.Properties().Routes { 988 var r vcTypes.Route 989 990 if route.Protocol == unix.RTPROT_KERNEL { 991 continue 992 } 993 994 if route.Dst != nil { 995 r.Dest = route.Dst.String() 996 } 997 998 if route.Gw != nil { 999 gateway := route.Gw.String() 1000 r.Gateway = gateway 1001 } 1002 1003 if route.Src != nil { 1004 r.Source = route.Src.String() 1005 } 1006 1007 r.Device = endpoint.Name() 1008 r.Scope = uint32(route.Scope) 1009 routes = append(routes, &r) 1010 1011 } 1012 } 1013 return ifaces, routes, nil 1014 } 1015 1016 func createNetworkInterfacePair(idx int, ifName string, interworkingModel NetInterworkingModel) (NetworkInterfacePair, error) { 1017 uniqueID := uuid.Generate().String() 1018 1019 randomMacAddr, err := generateRandomPrivateMacAddr() 1020 if err != nil { 1021 return NetworkInterfacePair{}, fmt.Errorf("Could not generate random mac address: %s", err) 1022 } 1023 1024 netPair := NetworkInterfacePair{ 1025 TapInterface: TapInterface{ 1026 ID: uniqueID, 1027 Name: fmt.Sprintf("br%d_kata", idx), 1028 TAPIface: NetworkInterface{ 1029 Name: fmt.Sprintf("tap%d_kata", idx), 1030 }, 1031 }, 1032 VirtIface: NetworkInterface{ 1033 Name: fmt.Sprintf("eth%d", idx), 1034 HardAddr: randomMacAddr, 1035 }, 1036 NetInterworkingModel: interworkingModel, 1037 } 1038 1039 if ifName != "" { 1040 netPair.VirtIface.Name = ifName 1041 } 1042 1043 return netPair, nil 1044 } 1045 1046 func generateRandomPrivateMacAddr() (string, error) { 1047 buf := make([]byte, 6) 1048 _, err := cryptoRand.Read(buf) 1049 if err != nil { 1050 return "", err 1051 } 1052 1053 // Set the local bit for local addresses 1054 // Addresses in this range are local mac addresses: 1055 // x2-xx-xx-xx-xx-xx , x6-xx-xx-xx-xx-xx , xA-xx-xx-xx-xx-xx , xE-xx-xx-xx-xx-xx 1056 buf[0] = (buf[0] | 2) & 
0xfe

	hardAddr := net.HardwareAddr(buf)
	return hardAddr.String(), nil
}

// networkInfoFromLink queries handle for the addresses and routes of link
// (all address families) and bundles them, together with the link attributes
// and the link type, into a NetworkInfo.
//
// The first netlink error encountered is returned with a zero NetworkInfo.
func networkInfoFromLink(handle *netlink.Handle, link netlink.Link) (NetworkInfo, error) {
	addrs, err := handle.AddrList(link, netlink.FAMILY_ALL)
	if err != nil {
		return NetworkInfo{}, err
	}

	routes, err := handle.RouteList(link, netlink.FAMILY_ALL)
	if err != nil {
		return NetworkInfo{}, err
	}

	return NetworkInfo{
		Iface: NetlinkIface{
			LinkAttrs: *(link.Attrs()),
			Type:      link.Type(),
		},
		Addrs:  addrs,
		Routes: routes,
	}, nil
}

// createEndpointsFromScan lists the links of the network namespace found at
// networkNSPath and creates one Endpoint for every usable link.
//
// Links without any address and loopback links are skipped. Each endpoint is
// created from inside the namespace (through doNetNS), receives an index that
// follows the scan order of the links that were kept, and gets the scanned
// NetworkInfo set as its properties. The result is sorted by endpoint name.
//
// On failure an empty, non-nil slice is returned together with the error.
func createEndpointsFromScan(networkNSPath string, config *NetworkConfig) ([]Endpoint, error) {
	var endpoints []Endpoint

	netnsHandle, err := netns.GetFromPath(networkNSPath)
	if err != nil {
		return []Endpoint{}, err
	}
	defer netnsHandle.Close()

	netlinkHandle, err := netlink.NewHandleAt(netnsHandle)
	if err != nil {
		return []Endpoint{}, err
	}
	defer netlinkHandle.Delete()

	linkList, err := netlinkHandle.LinkList()
	if err != nil {
		return []Endpoint{}, err
	}

	idx := 0
	for _, link := range linkList {
		netInfo, err := networkInfoFromLink(netlinkHandle, link)
		if err != nil {
			return []Endpoint{}, err
		}

		// Ignore unconfigured network interfaces. These are
		// either base tunnel devices that are not namespaced
		// like gre0, gretap0, sit0, ipip0, tunl0 or incorrectly
		// setup interfaces.
		if len(netInfo.Addrs) == 0 {
			continue
		}

		// Skip any loopback interfaces:
		if (netInfo.Iface.Flags & net.FlagLoopback) != 0 {
			continue
		}

		// The endpoint has to be created from within the namespace; the
		// closure hands the result back through the captured variable.
		var endpoint Endpoint
		if err := doNetNS(networkNSPath, func(_ ns.NetNS) error {
			var errCreate error
			endpoint, errCreate = createEndpoint(netInfo, idx, config.InterworkingModel, link)
			return errCreate
		}); err != nil {
			return []Endpoint{}, err
		}

		endpoint.SetProperties(netInfo)
		endpoints = append(endpoints, endpoint)

		idx++
	}

	sort.Slice(endpoints, func(i, j int) bool {
		return endpoints[i].Name() < endpoints[j].Name()
	})

	networkLogger().WithField("endpoints", endpoints).Info("Endpoints found after scan")

	return endpoints, nil
}

// createEndpoint builds the Endpoint matching the interface described by
// netInfo.
//
// Selection order:
//   - a physical interface gets a physical endpoint (no tap/bridge is created),
//   - an interface with an associated vhost-user socket gets a vhost-user
//     endpoint,
//   - otherwise the endpoint is chosen from netInfo.Iface.Type: "macvlan",
//     "macvtap", "tap", "tuntap", "veth" or "ipvlan".
//
// idx and model are forwarded to the endpoint constructors that take them.
// link is only consulted for "tuntap" interfaces, where its Mode decides
// whether the device can be used; for that type link must be a
// *netlink.Tuntap, otherwise an error is returned instead of a nil endpoint.
//
// An error is returned for unsupported interface types and tuntap modes, and
// whenever one of the helper calls fails.
func createEndpoint(netInfo NetworkInfo, idx int, model NetInterworkingModel, link netlink.Link) (Endpoint, error) {
	var endpoint Endpoint

	ifaceName := netInfo.Iface.Name

	// Check if interface is a physical interface. Do not create
	// tap interface/bridge if it is.
	isPhysical, err := isPhysicalIface(ifaceName)
	if err != nil {
		return nil, err
	}

	if isPhysical {
		networkLogger().WithField("interface", ifaceName).Info("Physical network interface found")
		endpoint, err = createPhysicalEndpoint(netInfo)
		return endpoint, err
	}

	// Check if this is a dummy interface which has a vhost-user socket associated with it
	socketPath, err := vhostUserSocketPath(netInfo)
	if err != nil {
		return nil, err
	}

	if socketPath != "" {
		networkLogger().WithField("interface", ifaceName).Info("VhostUser network interface found")
		endpoint, err = createVhostUserEndpoint(netInfo, socketPath)
		return endpoint, err
	}

	switch netInfo.Iface.Type {
	case "macvlan":
		networkLogger().Infof("macvlan interface found")
		endpoint, err = createBridgedMacvlanNetworkEndpoint(idx, ifaceName, model)
	case "macvtap":
		networkLogger().Infof("macvtap interface found")
		endpoint, err = createMacvtapNetworkEndpoint(netInfo)
	case "tap":
		networkLogger().Info("tap interface found")
		endpoint, err = createTapNetworkEndpoint(idx, ifaceName)
	case "tuntap":
		// The mode lives on the concrete link type. A nil or foreign link
		// must be reported: returning a nil endpoint with a nil error would
		// make callers dereference it.
		tuntap, ok := link.(*netlink.Tuntap)
		if !ok {
			return nil, fmt.Errorf("tuntap interface %s: no tuntap link information available (got %T)", ifaceName, link)
		}

		switch tuntap.Mode {
		case 0:
			// mount /sys/class/net to get links
			return nil, fmt.Errorf("Network device mode not determined correctly. Mount sysfs in caller")
		case 1:
			return nil, fmt.Errorf("tun networking device not yet supported")
		case 2:
			networkLogger().Info("tuntap tap interface found")
			endpoint, err = createTuntapNetworkEndpoint(idx, ifaceName, netInfo.Iface.HardwareAddr, model)
		default:
			return nil, fmt.Errorf("tuntap network %v mode unsupported", tuntap.Mode)
		}
	case "veth":
		endpoint, err = createVethNetworkEndpoint(idx, ifaceName, model)
	case "ipvlan":
		endpoint, err = createIPVlanNetworkEndpoint(idx, ifaceName)
	default:
		return nil, fmt.Errorf("Unsupported network interface: %s", netInfo.Iface.Type)
	}

	return endpoint, err
}

// Network is the virtcontainer network structure
type Network struct {
}

// trace starts a span called name from ctx and tags it with the "network"
// subsystem and the "default" type. It returns the span and the context
// produced by opentracing.StartSpanFromContext.
func (n *Network) trace(ctx context.Context, name string) (opentracing.Span, context.Context) {
	span, ct := opentracing.StartSpanFromContext(ctx, name)

	span.SetTag("subsystem", "network")
	span.SetTag("type", "default")

	return span, ct
}

// Run runs a callback in the specified network namespace.
//
// The call is traced under a "run" span started from a background context,
// and the error returned by cb (or by entering the namespace) is passed back.
func (n *Network) Run(networkNSPath string, cb func() error) error {
	span, _ := n.trace(context.Background(), "run")
	defer span.Finish()

	return doNetNS(networkNSPath, func(_ ns.NetNS) error {
		return cb()
	})
}

// Add adds all needed interfaces inside the network namespace.
1239 func (n *Network) Add(ctx context.Context, config *NetworkConfig, hypervisor hypervisor, hotplug bool) ([]Endpoint, error) { 1240 span, _ := n.trace(ctx, "add") 1241 defer span.Finish() 1242 1243 endpoints, err := createEndpointsFromScan(config.NetNSPath, config) 1244 if err != nil { 1245 return endpoints, err 1246 } 1247 1248 err = doNetNS(config.NetNSPath, func(_ ns.NetNS) error { 1249 for _, endpoint := range endpoints { 1250 networkLogger().WithField("endpoint-type", endpoint.Type()).WithField("hotplug", hotplug).Info("Attaching endpoint") 1251 if hotplug { 1252 if err := endpoint.HotAttach(hypervisor); err != nil { 1253 return err 1254 } 1255 } else { 1256 if err := endpoint.Attach(hypervisor); err != nil { 1257 return err 1258 } 1259 } 1260 } 1261 1262 return nil 1263 }) 1264 if err != nil { 1265 return []Endpoint{}, err 1266 } 1267 1268 networkLogger().Debug("Network added") 1269 1270 return endpoints, nil 1271 } 1272 1273 func (n *Network) PostAdd(ctx context.Context, ns *NetworkNamespace, hotplug bool) error { 1274 if hotplug { 1275 return nil 1276 } 1277 1278 if ns.Endpoints == nil { 1279 return nil 1280 } 1281 1282 endpoints := ns.Endpoints 1283 1284 for _, endpoint := range endpoints { 1285 netPair := endpoint.NetworkPair() 1286 if netPair == nil { 1287 continue 1288 } 1289 if netPair.VhostFds != nil { 1290 for _, VhostFd := range netPair.VhostFds { 1291 VhostFd.Close() 1292 } 1293 } 1294 } 1295 1296 return nil 1297 } 1298 1299 // Remove network endpoints in the network namespace. It also deletes the network 1300 // namespace in case the namespace has been created by us. 1301 func (n *Network) Remove(ctx context.Context, ns *NetworkNamespace, hypervisor hypervisor) error { 1302 span, _ := n.trace(ctx, "remove") 1303 defer span.Finish() 1304 1305 for _, endpoint := range ns.Endpoints { 1306 // Detach for an endpoint should enter the network namespace 1307 // if required. 
1308 networkLogger().WithField("endpoint-type", endpoint.Type()).Info("Detaching endpoint") 1309 if err := endpoint.Detach(ns.NetNsCreated, ns.NetNsPath); err != nil { 1310 return err 1311 } 1312 } 1313 1314 networkLogger().Debug("Network removed") 1315 1316 if ns.NetNsCreated { 1317 networkLogger().Infof("Network namespace %q deleted", ns.NetNsPath) 1318 return deleteNetNS(ns.NetNsPath) 1319 } 1320 1321 return nil 1322 }