github.com/mirantis/virtlet@v1.5.2-0.20191204181327-1659b8a48e9b/pkg/tapmanager/tapfdsource.go (about)

     1  /*
     2  Copyright 2017 Mirantis
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package tapmanager
    18  
    19  import (
    20  	"encoding/json"
    21  	"fmt"
    22  	"net"
    23  	"strings"
    24  	"sync"
    25  	"time"
    26  
    27  	"github.com/containernetworking/cni/pkg/ns"
    28  	cnitypes "github.com/containernetworking/cni/pkg/types"
    29  	cnicurrent "github.com/containernetworking/cni/pkg/types/current"
    30  	"github.com/davecgh/go-spew/spew"
    31  	"github.com/golang/glog"
    32  	"github.com/vishvananda/netlink"
    33  
    34  	"github.com/Mirantis/virtlet/pkg/cni"
    35  	"github.com/Mirantis/virtlet/pkg/dhcp"
    36  	"github.com/Mirantis/virtlet/pkg/nettools"
    37  	"github.com/Mirantis/virtlet/pkg/network"
    38  	"github.com/Mirantis/virtlet/pkg/utils"
    39  )
    40  
// Calico-related settings. Neither constant is referenced in this file.
// NOTE(review): calicoDefaultSubnet looks like a default subnet prefix
// length and calicoSubnetVar like the name of an environment variable
// overriding it — confirm against the code that uses them.
const (
	calicoDefaultSubnet = 24
	calicoSubnetVar     = "VIRTLET_CALICO_SUBNET"
)
    45  
// InterfaceDescription contains interface type with additional data
// needed to identify it. TapFDSource.GetInfo returns a JSON-encoded list
// of these, one entry per interface of the pod's container-side network.
type InterfaceDescription struct {
	// Type is the type of the interface, copied from the container-side
	// interface description.
	Type         network.InterfaceType `json:"type"`
	// HardwareAddr is the hardware (MAC) address of the interface.
	HardwareAddr net.HardwareAddr      `json:"mac"`
	// FdIndex is the index of the interface in the container-side network's
	// interface list. GetFDs appends file descriptors in the same order.
	FdIndex      int                   `json:"fdIndex"`
	// PCIAddress is copied from the container-side interface description.
	// NOTE(review): presumably only meaningful for SR-IOV devices — confirm.
	PCIAddress   string                `json:"pciAddress"`
}
    54  
// PodNetworkDesc contains the data that are required by TapFDSource
// to set up a tap device for a VM. It is passed in JSON form as part of
// GetFDPayload and RecoverPayload.
type PodNetworkDesc struct {
	// PodID specifies the id of the pod. It is also used to derive the
	// pod's network namespace (cni.CreateNetNS / cni.PodNetNSPath).
	PodID string `json:"podId"`
	// PodNs specifies the namespace of the pod
	PodNs string `json:"podNs"`
	// PodName specifies the name of the pod
	PodName string `json:"podName"`
	// DNS specifies DNS settings for the pod. When non-nil, GetFDs
	// overrides the nameservers, search domains and options of the CNI
	// result with these values. The field has no json tag, so it is
	// encoded under the key "DNS".
	DNS *cnitypes.DNS
}
    67  
// GetFDPayload contains the data that are required by TapFDSource
// to prepare container side network configuration. It is the JSON
// payload decoded by TapFDSource.GetFDs.
type GetFDPayload struct {
	// Description contains the pod information and DNS settings for the pod.
	// It is nil when the "podNetworkDesc" key is absent from the payload.
	Description *PodNetworkDesc `json:"podNetworkDesc"`
}
    74  
// RecoverPayload contains the data that are required by TapFDSource
// to recover a network configuration in a pod. It is the JSON payload
// decoded by TapFDSource.Recover.
type RecoverPayload struct {
	// Description contains the pod information and DNS settings for the pod
	Description *PodNetworkDesc `json:"podNetworkDesc"`
	// ContainerSideNetwork specifies the previously built container-side
	// network configuration that is used to reconfigure the recovered
	// environment. Recover fails if it is nil.
	ContainerSideNetwork *network.ContainerSideNetwork `json:"csn"`
	// HaveRunningContainers is true if any domains are currently running
	// for this pod. VF reconfiguration is to be skipped if that's the case.
	// The field has no json tag, so it is encoded under the key
	// "HaveRunningContainers".
	HaveRunningContainers bool
}
    87  
// podNetwork is the per-key state kept by TapFDSource for a pod whose
// network has been set up (or recovered) by setupNetNS.
type podNetwork struct {
	// pnd is a copy of the pod description the network was set up for.
	pnd        PodNetworkDesc
	// csn is the container-side network produced by the setup/recovery
	// callback passed to setupNetNS.
	csn        *network.ContainerSideNetwork
	// dhcpServer is the DHCP server started for this pod.
	dhcpServer *dhcp.Server
	// doneCh receives the result of the DHCP server's Serve call once the
	// serving goroutine finishes.
	doneCh     chan error
}
    94  
// TapFDSource sets up and tears down Virtlet VM network.
// It implements FDSource interface
type TapFDSource struct {
	// The embedded mutex guards fdMap. The dummy network cache fields
	// below are accessed by getDummyNetwork without taking it.
	sync.Mutex

	// cniClient is used to add pods to / remove pods from the CNI network
	// and to obtain the dummy network.
	cniClient          cni.Client
	// dummyNetwork and dummyNetworkNsPath cache the result of
	// cniClient.GetDummyNetwork (see getDummyNetwork).
	dummyNetwork       *cnicurrent.Result
	dummyNetworkNsPath string
	// fdMap maps the keys passed to GetFDs/Recover to the network state of
	// the corresponding pod.
	fdMap              map[string]*podNetwork
	// enableSriov is passed through to nettools.SetupContainerSideNetwork.
	enableSriov        bool
	// calicoSubnetSize is passed through to nettools.FixCalicoNetworking.
	calicoSubnetSize   int
}

// Compile-time check that *TapFDSource satisfies the FDSource interface.
var _ FDSource = &TapFDSource{}
   109  
   110  // NewTapFDSource returns a TapFDSource for the specified CNI plugin &
   111  // config dir
   112  func NewTapFDSource(cniClient cni.Client, enableSriov bool, calicoSubnetSize int) (*TapFDSource, error) {
   113  	s := &TapFDSource{
   114  		cniClient:        cniClient,
   115  		fdMap:            make(map[string]*podNetwork),
   116  		calicoSubnetSize: calicoSubnetSize,
   117  		enableSriov:      enableSriov,
   118  	}
   119  
   120  	return s, nil
   121  }
   122  
   123  func (s *TapFDSource) getDummyNetwork() (*cnicurrent.Result, string, error) {
   124  	if s.dummyNetwork == nil {
   125  		var err error
   126  		s.dummyNetwork, s.dummyNetworkNsPath, err = s.cniClient.GetDummyNetwork()
   127  		if err != nil {
   128  			return nil, "", err
   129  		}
   130  		// s.dummyGateway = dummyResult.IPs[0].Address.IP
   131  
   132  	}
   133  	return s.dummyNetwork, s.dummyNetworkNsPath, nil
   134  }
   135  
   136  // GetFDs implements GetFDs method of FDSource interface
   137  func (s *TapFDSource) GetFDs(key string, data []byte) ([]int, []byte, error) {
   138  	var payload GetFDPayload
   139  	if err := json.Unmarshal(data, &payload); err != nil {
   140  		return nil, nil, fmt.Errorf("error unmarshalling GetFD payload: %v", err)
   141  	}
   142  	pnd := payload.Description
   143  	if err := cni.CreateNetNS(pnd.PodID); err != nil {
   144  		return nil, nil, fmt.Errorf("error creating new netns for pod %s (%s): %v", pnd.PodName, pnd.PodID, err)
   145  	}
   146  
   147  	gotError := false
   148  	podAddedToNetwork := false
   149  	defer func() {
   150  		if gotError {
   151  			if podAddedToNetwork {
   152  				if err := s.cniClient.RemoveSandboxFromNetwork(pnd.PodID, pnd.PodName, pnd.PodNs); err != nil {
   153  					glog.Errorf("Error removing a pod from the pod network after failed network setup: %v", err)
   154  				}
   155  			}
   156  			if err := cni.DestroyNetNS(pnd.PodID); err != nil {
   157  				glog.Errorf("Error removing netns after failed network setup: %v", err)
   158  			}
   159  		}
   160  	}()
   161  
   162  	netConfig, err := s.cniClient.AddSandboxToNetwork(pnd.PodID, pnd.PodName, pnd.PodNs)
   163  	if err != nil {
   164  		gotError = true
   165  		return nil, nil, fmt.Errorf("error adding pod %s (%s) to CNI network: %v", pnd.PodName, pnd.PodID, err)
   166  	}
   167  	podAddedToNetwork = true
   168  	glog.V(3).Infof("CNI configuration for pod %s (%s): %s", pnd.PodName, pnd.PodID, spew.Sdump(netConfig))
   169  
   170  	if netConfig == nil {
   171  		netConfig = &cnicurrent.Result{}
   172  	}
   173  
   174  	if payload.Description.DNS != nil {
   175  		netConfig.DNS.Nameservers = pnd.DNS.Nameservers
   176  		netConfig.DNS.Search = pnd.DNS.Search
   177  		netConfig.DNS.Options = pnd.DNS.Options
   178  	}
   179  
   180  	var fds []int
   181  	var respData []byte
   182  	var csn *network.ContainerSideNetwork
   183  	if err := s.setupNetNS(key, pnd, func(netNSPath string, allLinks []netlink.Link, hostNS ns.NetNS) (*network.ContainerSideNetwork, error) {
   184  		if netConfig, err = nettools.ValidateAndFixCNIResult(netConfig, netNSPath, allLinks); err != nil {
   185  			gotError = true
   186  			return nil, fmt.Errorf("error fixing cni configuration: %v", err)
   187  		}
   188  		if err := nettools.FixCalicoNetworking(netConfig, s.calicoSubnetSize, s.getDummyNetwork); err != nil {
   189  			// don't fail in this case because there may be even no Calico
   190  			glog.Warningf("Calico detection/fix didn't work: %v", err)
   191  		}
   192  		glog.V(3).Infof("CNI Result after fix:\n%s", spew.Sdump(netConfig))
   193  
   194  		var err error
   195  		if csn, err = nettools.SetupContainerSideNetwork(netConfig, netNSPath, allLinks, s.enableSriov, hostNS); err != nil {
   196  			return nil, err
   197  		}
   198  
   199  		if respData, err = json.Marshal(csn); err != nil {
   200  			return nil, fmt.Errorf("error marshalling net config: %v", err)
   201  		}
   202  
   203  		for _, i := range csn.Interfaces {
   204  			fds = append(fds, int(i.Fo.Fd()))
   205  		}
   206  		return csn, nil
   207  	}); err != nil {
   208  		gotError = true
   209  		return nil, nil, err
   210  	}
   211  
   212  	return fds, respData, nil
   213  }
   214  
   215  // Release implements Release method of FDSource interface
   216  func (s *TapFDSource) Release(key string) error {
   217  	s.Lock()
   218  	defer s.Unlock()
   219  	pn, found := s.fdMap[key]
   220  	if !found {
   221  		return fmt.Errorf("bad fd key: %q", key)
   222  	}
   223  
   224  	netNSPath := cni.PodNetNSPath(pn.pnd.PodID)
   225  
   226  	vmNS, err := ns.GetNS(netNSPath)
   227  	if err != nil {
   228  		return fmt.Errorf("failed to open network namespace at %q: %v", netNSPath, err)
   229  	}
   230  
   231  	// Try to keep this function idempotent even if there are errors during the following calls.
   232  	// This can cause some resource leaks in multiple CNI case but makes it possible
   233  	// to call `RunPodSandbox` again after a failed attempt. Failing to do so would cause
   234  	// the next `RunPodSandbox` call to fail due to the netns already being present.
   235  	defer func() {
   236  		if err := cni.DestroyNetNS(pn.pnd.PodID); err != nil {
   237  			glog.Errorf("Error when removing network namespace for pod sandbox %q: %v", pn.pnd.PodID, err)
   238  		}
   239  	}()
   240  
   241  	if err := nettools.ReconstructVFs(pn.csn, vmNS, false); err != nil {
   242  		return fmt.Errorf("failed to reconstruct SR-IOV devices: %v", err)
   243  	}
   244  
   245  	if err := vmNS.Do(func(ns.NetNS) error {
   246  		if err := pn.dhcpServer.Close(); err != nil {
   247  			return fmt.Errorf("failed to stop dhcp server: %v", err)
   248  		}
   249  		<-pn.doneCh
   250  		return nettools.Teardown(pn.csn)
   251  	}); err != nil {
   252  		return err
   253  	}
   254  
   255  	if err := s.cniClient.RemoveSandboxFromNetwork(pn.pnd.PodID, pn.pnd.PodName, pn.pnd.PodNs); err != nil {
   256  		return fmt.Errorf("error removing pod sandbox %q from CNI network: %v", pn.pnd.PodID, err)
   257  	}
   258  
   259  	delete(s.fdMap, key)
   260  	return nil
   261  }
   262  
   263  // GetInfo implements GetInfo method of FDSource interface
   264  func (s *TapFDSource) GetInfo(key string) ([]byte, error) {
   265  	s.Lock()
   266  	defer s.Unlock()
   267  	pn, found := s.fdMap[key]
   268  	if !found {
   269  		return nil, fmt.Errorf("bad fd key: %q", key)
   270  	}
   271  	var descriptions []InterfaceDescription
   272  	for i, iface := range pn.csn.Interfaces {
   273  		descriptions = append(descriptions, InterfaceDescription{
   274  			FdIndex:      i,
   275  			HardwareAddr: iface.HardwareAddr,
   276  			Type:         iface.Type,
   277  			PCIAddress:   iface.PCIAddress,
   278  		})
   279  	}
   280  	data, err := json.Marshal(descriptions)
   281  	if err != nil {
   282  		return nil, fmt.Errorf("interface descriptions marshaling error: %v", err)
   283  	}
   284  	return data, nil
   285  }
   286  
   287  // Stop stops any running DHCP servers associated with TapFDSource
   288  // and closes tap fds without releasing any other resources.
   289  func (s *TapFDSource) Stop() error {
   290  	s.Lock()
   291  	defer s.Unlock()
   292  	var errors []string
   293  	for _, pn := range s.fdMap {
   294  		if err := pn.dhcpServer.Close(); err != nil {
   295  			errors = append(errors, fmt.Sprintf("error stopping dhcp server: %v", err.Error()))
   296  		} else {
   297  			<-pn.doneCh
   298  		}
   299  		for _, i := range pn.csn.Interfaces {
   300  			if err := i.Fo.Close(); err != nil {
   301  				errors = append(errors, fmt.Sprintf("error closing tap fd: %v", err))
   302  			}
   303  		}
   304  	}
   305  	s.fdMap = make(map[string]*podNetwork)
   306  	if errors != nil {
   307  		return fmt.Errorf("Errors while stopping TapFDSource:\n%s", strings.Join(errors, "\n"))
   308  	}
   309  	return nil
   310  }
   311  
   312  // Recover recovers the state for the netns after Virtlet restart
   313  func (s *TapFDSource) Recover(key string, data []byte) error {
   314  	var payload RecoverPayload
   315  	if err := json.Unmarshal(data, &payload); err != nil {
   316  		return fmt.Errorf("error unmarshalling GetFD payload: %v", err)
   317  	}
   318  	pnd := payload.Description
   319  	csn := payload.ContainerSideNetwork
   320  	if csn == nil {
   321  		return fmt.Errorf("ContainerSideNetwork not passed to Recover()")
   322  	}
   323  	if csn.Result == nil {
   324  		csn.Result = &cnicurrent.Result{}
   325  	}
   326  	netNSPath := cni.PodNetNSPath(pnd.PodID)
   327  	vmNS, err := ns.GetNS(netNSPath)
   328  	if err != nil {
   329  		return fmt.Errorf("failed to open network namespace at %q: %v", netNSPath, err)
   330  	}
   331  	if !payload.HaveRunningContainers {
   332  		if err := nettools.ReconstructVFs(csn, vmNS, true); err != nil {
   333  			return err
   334  		}
   335  	}
   336  	return s.setupNetNS(key, pnd, func(netNSPath string, allLinks []netlink.Link, hostNS ns.NetNS) (*network.ContainerSideNetwork, error) {
   337  		if err := nettools.RecoverContainerSideNetwork(csn, netNSPath, allLinks, hostNS); err != nil {
   338  			return nil, err
   339  		}
   340  		return csn, nil
   341  	})
   342  }
   343  
   344  // RetrieveFDs retrieves the FDs.
   345  // It's only used in case if VM exited but Recover() didn't populate the FDs
   346  func (s *TapFDSource) RetrieveFDs(key string) ([]int, error) {
   347  	var podNet *podNetwork
   348  	var fds []int
   349  	func() {
   350  		s.Lock()
   351  		defer s.Unlock()
   352  		podNet = s.fdMap[key]
   353  	}()
   354  	if podNet == nil {
   355  		return nil, fmt.Errorf("bad key %q to retrieve FDs", key)
   356  	}
   357  
   358  	netNSPath := cni.PodNetNSPath(podNet.pnd.PodID)
   359  	vmNS, err := ns.GetNS(netNSPath)
   360  	if err != nil {
   361  		return nil, fmt.Errorf("failed to open network namespace at %q: %v", netNSPath, err)
   362  	}
   363  
   364  	if err := utils.CallInNetNSWithSysfsRemounted(vmNS, func(hostNS ns.NetNS) error {
   365  		allLinks, err := netlink.LinkList()
   366  		if err != nil {
   367  			return fmt.Errorf("error listing the links: %v", err)
   368  		}
   369  
   370  		return nettools.RecoverContainerSideNetwork(podNet.csn, netNSPath, allLinks, hostNS)
   371  	}); err != nil {
   372  		return nil, err
   373  	}
   374  
   375  	for _, ifDesc := range podNet.csn.Interfaces {
   376  		// Fail if not all succeeded
   377  		if ifDesc.Fo == nil {
   378  			return nil, fmt.Errorf("failed to open tap interface %q", ifDesc.Name)
   379  		}
   380  		fds = append(fds, int(ifDesc.Fo.Fd()))
   381  	}
   382  	return fds, nil
   383  }
   384  
   385  func (s *TapFDSource) setupNetNS(key string, pnd *PodNetworkDesc, initNet func(netNSPath string, allLinks []netlink.Link, hostNS ns.NetNS) (*network.ContainerSideNetwork, error)) error {
   386  	netNSPath := cni.PodNetNSPath(pnd.PodID)
   387  	vmNS, err := ns.GetNS(netNSPath)
   388  	if err != nil {
   389  		return fmt.Errorf("failed to open network namespace at %q: %v", netNSPath, err)
   390  	}
   391  
   392  	var csn *network.ContainerSideNetwork
   393  	var dhcpServer *dhcp.Server
   394  	doneCh := make(chan error)
   395  	if err := utils.CallInNetNSWithSysfsRemounted(vmNS, func(hostNS ns.NetNS) error {
   396  		allLinks, err := netlink.LinkList()
   397  		if err != nil {
   398  			return fmt.Errorf("error listing the links: %v", err)
   399  		}
   400  
   401  		if csn, err = initNet(netNSPath, allLinks, hostNS); err != nil {
   402  			return err
   403  		}
   404  
   405  		dhcpServer = dhcp.NewServer(csn)
   406  		if err := dhcpServer.SetupListener("0.0.0.0"); err != nil {
   407  			return fmt.Errorf("Failed to set up dhcp listener: %v", err)
   408  		}
   409  		go func() {
   410  			doneCh <- vmNS.Do(func(ns.NetNS) error {
   411  				err := dhcpServer.Serve()
   412  				if err != nil {
   413  					glog.Errorf("dhcp server error: %v", err)
   414  				}
   415  				return err
   416  			})
   417  		}()
   418  
   419  		// FIXME: there's some very small possibility for a race here
   420  		// (happens if the VM makes DHCP request before DHCP server is ready)
   421  		// For now, let's make the probability of such problem even smaller
   422  		time.Sleep(500 * time.Millisecond)
   423  		return nil
   424  	}); err != nil {
   425  		return err
   426  	}
   427  
   428  	s.Lock()
   429  	defer s.Unlock()
   430  	s.fdMap[key] = &podNetwork{
   431  		pnd:        *pnd,
   432  		csn:        csn,
   433  		dhcpServer: dhcpServer,
   434  		doneCh:     doneCh,
   435  	}
   436  	return nil
   437  }