github.com/k8snetworkplumbingwg/sriov-network-operator@v1.2.1-0.20240408194816-2d2e5a45d453/test/util/cluster/cluster.go (about)

     1  package cluster
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"regexp"
    10  	"strings"
    11  	"time"
    12  
    13  	corev1 "k8s.io/api/core/v1"
    14  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    15  	"k8s.io/apimachinery/pkg/util/wait"
    16  	"k8s.io/client-go/kubernetes"
    17  	"k8s.io/utils/pointer"
    18  	runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
    19  
    20  	sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
    21  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
    22  	testclient "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/client"
    23  	"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/nodes"
    24  	"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/pod"
    25  )
    26  
    27  // EnabledNodes provides info on sriov enabled nodes of the cluster.
    28  type EnabledNodes struct {
    29  	Nodes               []string
    30  	States              map[string]sriovv1.SriovNetworkNodeState
    31  	IsSecureBootEnabled map[string]bool
    32  }
    33  
    34  var (
    35  	supportedPFDrivers = []string{"mlx5_core", "i40e", "ixgbe", "ice", "igb"}
    36  	supportedVFDrivers = []string{"iavf", "vfio-pci", "mlx5_core", "igbvf"}
    37  	mlxVendorID        = "15b3"
    38  	intelVendorID      = "8086"
    39  )
    40  
    41  // Name of environment variable to filter which decives can be discovered by FindSriovDevices and FindOneSriovDevice.
    42  // The filter is a regexp matched against node names and device name in the form <node_name>:<device_name>
    43  //
    44  // For example, given the following devices in the cluster:
    45  //
    46  // worker-0:eno1
    47  // worker-0:eno2
    48  // worker-1:eno1
    49  // worker-1:eno2
    50  // worker-1:ens1f0
    51  // worker-1:ens1f1
    52  //
    53  // Values:
    54  // - `.*:eno1` matches `worker-0:eno1,worker-1:eno1`
    55  // - `worker-0:eno.*` matches `worker-0:eno1,worker-0:eno2`
    56  // - `worker-0:eno1|worker-1:eno2` matches `worker-0:eno1,worker-1:eno2`
    57  const NodeAndDeviceNameFilterEnvVar string = "SRIOV_NODE_AND_DEVICE_NAME_FILTER"
    58  
    59  // DiscoverSriov retrieves Sriov related information of a given cluster.
    60  func DiscoverSriov(clients *testclient.ClientSet, operatorNamespace string) (*EnabledNodes, error) {
    61  	nodeStates, err := clients.SriovNetworkNodeStates(operatorNamespace).List(context.Background(), metav1.ListOptions{})
    62  	if err != nil {
    63  		return nil, fmt.Errorf("failed to retrieve note states %v", err)
    64  	}
    65  
    66  	res := &EnabledNodes{}
    67  	res.States = make(map[string]sriovv1.SriovNetworkNodeState)
    68  	res.Nodes = make([]string, 0)
    69  	res.IsSecureBootEnabled = make(map[string]bool)
    70  
    71  	ss, err := nodes.MatchingOptionalSelectorState(clients, nodeStates.Items)
    72  	if err != nil {
    73  		return nil, fmt.Errorf("failed to find matching node states %v", err)
    74  	}
    75  
    76  	err = sriovv1.InitNicIDMapFromConfigMap(kubernetes.NewForConfigOrDie(clients.Config), operatorNamespace)
    77  	if err != nil {
    78  		return nil, fmt.Errorf("failed to InitNicIdMap %v", err)
    79  	}
    80  
    81  	for _, state := range ss {
    82  		isStable, err := stateStable(state)
    83  		if err != nil {
    84  			return nil, err
    85  		}
    86  		if !isStable {
    87  			return nil, fmt.Errorf("sync status still in progress")
    88  		}
    89  
    90  		node := state.Name
    91  		for _, itf := range state.Status.Interfaces {
    92  			if IsPFDriverSupported(itf.Driver) && sriovv1.IsSupportedDevice(itf.DeviceID) {
    93  				res.Nodes = append(res.Nodes, node)
    94  				res.States[node] = state
    95  				break
    96  			}
    97  		}
    98  	}
    99  
   100  	for _, node := range res.Nodes {
   101  		isSecureBootEnabled, err := GetNodeSecureBootState(clients, node, operatorNamespace)
   102  		if err != nil {
   103  			return nil, err
   104  		}
   105  
   106  		res.IsSecureBootEnabled[node] = isSecureBootEnabled
   107  	}
   108  
   109  	if len(res.Nodes) == 0 {
   110  		return nil, fmt.Errorf("no sriov enabled node found")
   111  	}
   112  	return res, nil
   113  }
   114  
   115  // FindOneSriovDevice retrieves a valid sriov device for the given node, filtered by `SRIOV_NODE_AND_DEVICE_NAME_FILTER` environment variable.
   116  func (n *EnabledNodes) FindOneSriovDevice(node string) (*sriovv1.InterfaceExt, error) {
   117  	ret, err := n.FindSriovDevices(node)
   118  	if err != nil {
   119  		return nil, err
   120  	}
   121  
   122  	if len(ret) == 0 {
   123  		return nil, fmt.Errorf("unable to find sriov devices in node %s", node)
   124  	}
   125  
   126  	return ret[0], nil
   127  }
   128  
   129  // FindSriovDevices retrieves all valid sriov devices for the given node, filtered by `SRIOV_NODE_AND_DEVICE_NAME_FILTER` environment variable.
   130  func (n *EnabledNodes) FindSriovDevices(node string) ([]*sriovv1.InterfaceExt, error) {
   131  	devices, err := n.FindSriovDevicesIgnoreFilters(node)
   132  	if err != nil {
   133  		return nil, err
   134  	}
   135  
   136  	sriovDeviceNameFilter, ok := os.LookupEnv(NodeAndDeviceNameFilterEnvVar)
   137  	if !ok {
   138  		return devices, nil
   139  	}
   140  
   141  	filteredDevices := []*sriovv1.InterfaceExt{}
   142  	for _, device := range devices {
   143  		match, err := regexp.MatchString(sriovDeviceNameFilter, node+":"+device.Name)
   144  		if err != nil {
   145  			return nil, err
   146  		}
   147  
   148  		if match {
   149  			filteredDevices = append(filteredDevices, device)
   150  		}
   151  	}
   152  
   153  	return filteredDevices, nil
   154  }
   155  
   156  // FindSriovDevicesIgnoreFilters retrieves all valid sriov devices for the given node.
   157  func (n *EnabledNodes) FindSriovDevicesIgnoreFilters(node string) ([]*sriovv1.InterfaceExt, error) {
   158  	devices := []*sriovv1.InterfaceExt{}
   159  	s, ok := n.States[node]
   160  	if !ok {
   161  		return nil, fmt.Errorf("node %s not found", node)
   162  	}
   163  
   164  	for i, itf := range s.Status.Interfaces {
   165  		if IsPFDriverSupported(itf.Driver) && sriovv1.IsSupportedDevice(itf.DeviceID) {
   166  			// Skip mlx interfaces if secure boot is enabled
   167  			// TODO: remove this when mlx support secure boot/lockdown mode
   168  			if itf.Vendor == mlxVendorID && n.IsSecureBootEnabled[node] {
   169  				continue
   170  			}
   171  
   172  			// if the sriov is not enable in the kernel for intel nic the totalVF will be 0 so we skip the device
   173  			// That is not the case for Mellanox devices that will report 0 until we configure the sriov interfaces
   174  			// with the mstconfig package
   175  			if itf.Vendor == intelVendorID && itf.TotalVfs == 0 {
   176  				continue
   177  			}
   178  
   179  			devices = append(devices, &s.Status.Interfaces[i])
   180  		}
   181  	}
   182  	return devices, nil
   183  }
   184  
   185  // FindOneSriovNodeAndDevice finds a cluster node with one SR-IOV devices respecting the `SRIOV_NODE_AND_DEVICE_NAME_FILTER` filter.
   186  func (n *EnabledNodes) FindOneSriovNodeAndDevice() (string, *sriovv1.InterfaceExt, error) {
   187  	errs := []error{}
   188  	for _, node := range n.Nodes {
   189  		devices, err := n.FindSriovDevices(node)
   190  		if err != nil {
   191  			errs = append(errs, err)
   192  			continue
   193  		}
   194  
   195  		if len(devices) > 0 {
   196  			return node, devices[0], nil
   197  		}
   198  	}
   199  
   200  	return "", nil, fmt.Errorf("can't find any SR-IOV devices in cluster's nodes: %w", errors.Join(errs...))
   201  }
   202  
   203  // FindOneVfioSriovDevice retrieves a node with a valid sriov device for vfio
   204  func (n *EnabledNodes) FindOneVfioSriovDevice() (string, sriovv1.InterfaceExt) {
   205  	for _, node := range n.Nodes {
   206  		for _, nic := range n.States[node].Status.Interfaces {
   207  			if nic.Vendor == intelVendorID && sriovv1.IsSupportedModel(nic.Vendor, nic.DeviceID) && nic.TotalVfs != 0 {
   208  				return node, nic
   209  			}
   210  		}
   211  	}
   212  	return "", sriovv1.InterfaceExt{}
   213  }
   214  
   215  // FindOneMellanoxSriovDevice retrieves a valid sriov device for the given node.
   216  func (n *EnabledNodes) FindOneMellanoxSriovDevice(node string) (*sriovv1.InterfaceExt, error) {
   217  	s, ok := n.States[node]
   218  	if !ok {
   219  		return nil, fmt.Errorf("node %s not found", node)
   220  	}
   221  
   222  	// return error here as mlx interfaces are not supported when secure boot is enabled
   223  	// TODO: remove this when mlx support secure boot/lockdown mode
   224  	if n.IsSecureBootEnabled[node] {
   225  		return nil, fmt.Errorf("secure boot is enabled on the node mellanox cards are not supported")
   226  	}
   227  
   228  	for _, itf := range s.Status.Interfaces {
   229  		if itf.Vendor == mlxVendorID && sriovv1.IsSupportedModel(itf.Vendor, itf.DeviceID) {
   230  			return &itf, nil
   231  		}
   232  	}
   233  
   234  	return nil, fmt.Errorf("unable to find a mellanox sriov devices in node %s", node)
   235  }
   236  
   237  // SriovStable tells if all the node states are in sync (and the cluster is ready for another round of tests)
   238  func SriovStable(operatorNamespace string, clients *testclient.ClientSet) (bool, error) {
   239  	nodeStates, err := clients.SriovNetworkNodeStates(operatorNamespace).List(context.Background(), metav1.ListOptions{})
   240  	switch err {
   241  	case io.ErrUnexpectedEOF:
   242  		return false, err
   243  	case nil:
   244  		break
   245  	default:
   246  		return false, fmt.Errorf("failed to fetch nodes state %v", err)
   247  	}
   248  
   249  	if len(nodeStates.Items) == 0 {
   250  		return false, nil
   251  	}
   252  	for _, state := range nodeStates.Items {
   253  		nodeReady, err := stateStable(state)
   254  		if err != nil {
   255  			return false, err
   256  		}
   257  		if !nodeReady {
   258  			return false, nil
   259  		}
   260  	}
   261  	return true, nil
   262  }
   263  
   264  func stateStable(state sriovv1.SriovNetworkNodeState) (bool, error) {
   265  	switch state.Status.SyncStatus {
   266  	case "Succeeded":
   267  		return true, nil
   268  	// When the config daemon is restarted the status will be empty
   269  	// This doesn't mean the config was applied
   270  	case "":
   271  		return false, nil
   272  	}
   273  	return false, nil
   274  }
   275  
   276  func IsPFDriverSupported(driver string) bool {
   277  	for _, supportedDriver := range supportedPFDrivers {
   278  		if strings.Contains(driver, supportedDriver) {
   279  			return true
   280  		}
   281  	}
   282  	return false
   283  }
   284  
   285  func IsVFDriverSupported(driver string) bool {
   286  	for _, supportedDriver := range supportedVFDrivers {
   287  		if strings.Contains(driver, supportedDriver) {
   288  			return true
   289  		}
   290  	}
   291  	return false
   292  }
   293  
   294  func IsClusterStable(clients *testclient.ClientSet) (bool, error) {
   295  	nodes, err := clients.CoreV1Interface.Nodes().List(context.Background(), metav1.ListOptions{})
   296  	if err != nil {
   297  		return false, err
   298  	}
   299  
   300  	for _, node := range nodes.Items {
   301  		if node.Spec.Unschedulable {
   302  			return false, nil
   303  		}
   304  	}
   305  
   306  	return true, nil
   307  }
   308  
   309  // IsSingleNode validates if the environment is single node cluster
   310  // This is done by checking numer of nodes, it can later be substituted by an env variable if needed
   311  func IsSingleNode(clients *testclient.ClientSet) (bool, error) {
   312  	nodes, err := clients.CoreV1Interface.Nodes().List(context.Background(), metav1.ListOptions{})
   313  	if err != nil {
   314  		return false, err
   315  	}
   316  	return len(nodes.Items) == 1, nil
   317  }
   318  
   319  func GetNodeDrainState(clients *testclient.ClientSet, operatorNamespace string) (bool, error) {
   320  	sriovOperatorConfg := &sriovv1.SriovOperatorConfig{}
   321  	err := clients.Get(context.TODO(), runtimeclient.ObjectKey{Name: "default", Namespace: operatorNamespace}, sriovOperatorConfg)
   322  	return sriovOperatorConfg.Spec.DisableDrain, err
   323  }
   324  
   325  func SetDisableNodeDrainState(clients *testclient.ClientSet, operatorNamespace string, state bool) error {
   326  	sriovOperatorConfg := &sriovv1.SriovOperatorConfig{}
   327  	err := clients.Get(context.TODO(), runtimeclient.ObjectKey{Name: "default", Namespace: operatorNamespace}, sriovOperatorConfg)
   328  	if err != nil {
   329  		return err
   330  	}
   331  	sriovOperatorConfg.Spec.DisableDrain = state
   332  	err = clients.Update(context.TODO(), sriovOperatorConfg)
   333  	if err != nil {
   334  		return err
   335  	}
   336  	return nil
   337  }
   338  
   339  func GetNodeSecureBootState(clients *testclient.ClientSet, nodeName, namespace string) (bool, error) {
   340  	podDefinition := pod.GetDefinition()
   341  	podDefinition = pod.RedefineWithNodeSelector(podDefinition, nodeName)
   342  	podDefinition = pod.RedefineAsPrivileged(podDefinition)
   343  	podDefinition.Namespace = namespace
   344  
   345  	volume := corev1.Volume{Name: "host", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: "/"}}}
   346  	mount := corev1.VolumeMount{Name: "host", MountPath: consts.Host}
   347  	podDefinition = pod.RedefineWithMount(podDefinition, volume, mount)
   348  	created, err := clients.Pods(namespace).Create(context.Background(), podDefinition, metav1.CreateOptions{})
   349  	if err != nil {
   350  		return false, err
   351  	}
   352  
   353  	defer func() {
   354  		err = clients.Pods(namespace).Delete(context.Background(), created.Name, metav1.DeleteOptions{GracePeriodSeconds: pointer.Int64Ptr(0)})
   355  		if err != nil {
   356  			err = fmt.Errorf("failed to remove the check secure boot status pod for node %s: %v", nodeName, err)
   357  		}
   358  	}()
   359  
   360  	var runningPod *corev1.Pod
   361  	err = wait.PollImmediate(time.Second, 3*time.Minute, func() (bool, error) {
   362  		runningPod, err = clients.Pods(namespace).Get(context.Background(), created.Name, metav1.GetOptions{})
   363  		if err != nil {
   364  			return false, err
   365  		}
   366  
   367  		if runningPod.Status.Phase != corev1.PodRunning {
   368  			return false, nil
   369  		}
   370  
   371  		return true, nil
   372  	})
   373  	if err != nil {
   374  		return false, err
   375  	}
   376  
   377  	stdout, _, err := pod.ExecCommand(clients, runningPod, "cat", "/host/sys/kernel/security/lockdown")
   378  
   379  	if strings.Contains(stdout, "No such file or directory") {
   380  		return false, nil
   381  	}
   382  	if err != nil {
   383  		return false, err
   384  	}
   385  
   386  	return strings.Contains(stdout, "[integrity]") || strings.Contains(stdout, "[confidentiality]"), nil
   387  }
   388  
   389  func VirtualCluster() bool {
   390  	if v, exist := os.LookupEnv("CLUSTER_HAS_EMULATED_PF"); exist && v != "" {
   391  		return true
   392  	}
   393  	return false
   394  }