// Source: github.com/k8snetworkplumbingwg/sriov-network-operator@v1.2.1-0.20240408194816-2d2e5a45d453/test/util/cluster/cluster.go

package cluster

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"regexp"
	"strings"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/utils/pointer"
	runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"

	sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
	testclient "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/client"
	"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/nodes"
	"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/pod"
)

// EnabledNodes provides info on sriov enabled nodes of the cluster.
type EnabledNodes struct {
	// Nodes lists the names of the nodes that expose at least one supported
	// SR-IOV physical function.
	Nodes []string
	// States maps a node name to the SriovNetworkNodeState recorded for it.
	States map[string]sriovv1.SriovNetworkNodeState
	// IsSecureBootEnabled maps a node name to the result of
	// GetNodeSecureBootState for that node.
	IsSecureBootEnabled map[string]bool
}

var (
	// supportedPFDrivers are the physical function driver names accepted by
	// IsPFDriverSupported (matched as substrings of the reported driver).
	supportedPFDrivers = []string{"mlx5_core", "i40e", "ixgbe", "ice", "igb"}
	// supportedVFDrivers are the virtual function driver names accepted by
	// IsVFDriverSupported (matched as substrings of the reported driver).
	supportedVFDrivers = []string{"iavf", "vfio-pci", "mlx5_core", "igbvf"}
	// mlxVendorID is the vendor ID compared against InterfaceExt.Vendor to
	// recognise Mellanox devices.
	mlxVendorID = "15b3"
	// intelVendorID is the vendor ID compared against InterfaceExt.Vendor to
	// recognise Intel devices.
	intelVendorID = "8086"
)

// NodeAndDeviceNameFilterEnvVar is the name of the environment variable that filters which devices can be discovered by FindSriovDevices and FindOneSriovDevice.
42 // The filter is a regexp matched against node names and device name in the form <node_name>:<device_name> 43 // 44 // For example, given the following devices in the cluster: 45 // 46 // worker-0:eno1 47 // worker-0:eno2 48 // worker-1:eno1 49 // worker-1:eno2 50 // worker-1:ens1f0 51 // worker-1:ens1f1 52 // 53 // Values: 54 // - `.*:eno1` matches `worker-0:eno1,worker-1:eno1` 55 // - `worker-0:eno.*` matches `worker-0:eno1,worker-0:eno2` 56 // - `worker-0:eno1|worker-1:eno2` matches `worker-0:eno1,worker-1:eno2` 57 const NodeAndDeviceNameFilterEnvVar string = "SRIOV_NODE_AND_DEVICE_NAME_FILTER" 58 59 // DiscoverSriov retrieves Sriov related information of a given cluster. 60 func DiscoverSriov(clients *testclient.ClientSet, operatorNamespace string) (*EnabledNodes, error) { 61 nodeStates, err := clients.SriovNetworkNodeStates(operatorNamespace).List(context.Background(), metav1.ListOptions{}) 62 if err != nil { 63 return nil, fmt.Errorf("failed to retrieve note states %v", err) 64 } 65 66 res := &EnabledNodes{} 67 res.States = make(map[string]sriovv1.SriovNetworkNodeState) 68 res.Nodes = make([]string, 0) 69 res.IsSecureBootEnabled = make(map[string]bool) 70 71 ss, err := nodes.MatchingOptionalSelectorState(clients, nodeStates.Items) 72 if err != nil { 73 return nil, fmt.Errorf("failed to find matching node states %v", err) 74 } 75 76 err = sriovv1.InitNicIDMapFromConfigMap(kubernetes.NewForConfigOrDie(clients.Config), operatorNamespace) 77 if err != nil { 78 return nil, fmt.Errorf("failed to InitNicIdMap %v", err) 79 } 80 81 for _, state := range ss { 82 isStable, err := stateStable(state) 83 if err != nil { 84 return nil, err 85 } 86 if !isStable { 87 return nil, fmt.Errorf("sync status still in progress") 88 } 89 90 node := state.Name 91 for _, itf := range state.Status.Interfaces { 92 if IsPFDriverSupported(itf.Driver) && sriovv1.IsSupportedDevice(itf.DeviceID) { 93 res.Nodes = append(res.Nodes, node) 94 res.States[node] = state 95 break 96 } 97 } 98 } 99 
100 for _, node := range res.Nodes { 101 isSecureBootEnabled, err := GetNodeSecureBootState(clients, node, operatorNamespace) 102 if err != nil { 103 return nil, err 104 } 105 106 res.IsSecureBootEnabled[node] = isSecureBootEnabled 107 } 108 109 if len(res.Nodes) == 0 { 110 return nil, fmt.Errorf("no sriov enabled node found") 111 } 112 return res, nil 113 } 114 115 // FindOneSriovDevice retrieves a valid sriov device for the given node, filtered by `SRIOV_NODE_AND_DEVICE_NAME_FILTER` environment variable. 116 func (n *EnabledNodes) FindOneSriovDevice(node string) (*sriovv1.InterfaceExt, error) { 117 ret, err := n.FindSriovDevices(node) 118 if err != nil { 119 return nil, err 120 } 121 122 if len(ret) == 0 { 123 return nil, fmt.Errorf("unable to find sriov devices in node %s", node) 124 } 125 126 return ret[0], nil 127 } 128 129 // FindSriovDevices retrieves all valid sriov devices for the given node, filtered by `SRIOV_NODE_AND_DEVICE_NAME_FILTER` environment variable. 130 func (n *EnabledNodes) FindSriovDevices(node string) ([]*sriovv1.InterfaceExt, error) { 131 devices, err := n.FindSriovDevicesIgnoreFilters(node) 132 if err != nil { 133 return nil, err 134 } 135 136 sriovDeviceNameFilter, ok := os.LookupEnv(NodeAndDeviceNameFilterEnvVar) 137 if !ok { 138 return devices, nil 139 } 140 141 filteredDevices := []*sriovv1.InterfaceExt{} 142 for _, device := range devices { 143 match, err := regexp.MatchString(sriovDeviceNameFilter, node+":"+device.Name) 144 if err != nil { 145 return nil, err 146 } 147 148 if match { 149 filteredDevices = append(filteredDevices, device) 150 } 151 } 152 153 return filteredDevices, nil 154 } 155 156 // FindSriovDevicesIgnoreFilters retrieves all valid sriov devices for the given node. 
157 func (n *EnabledNodes) FindSriovDevicesIgnoreFilters(node string) ([]*sriovv1.InterfaceExt, error) { 158 devices := []*sriovv1.InterfaceExt{} 159 s, ok := n.States[node] 160 if !ok { 161 return nil, fmt.Errorf("node %s not found", node) 162 } 163 164 for i, itf := range s.Status.Interfaces { 165 if IsPFDriverSupported(itf.Driver) && sriovv1.IsSupportedDevice(itf.DeviceID) { 166 // Skip mlx interfaces if secure boot is enabled 167 // TODO: remove this when mlx support secure boot/lockdown mode 168 if itf.Vendor == mlxVendorID && n.IsSecureBootEnabled[node] { 169 continue 170 } 171 172 // if the sriov is not enable in the kernel for intel nic the totalVF will be 0 so we skip the device 173 // That is not the case for Mellanox devices that will report 0 until we configure the sriov interfaces 174 // with the mstconfig package 175 if itf.Vendor == intelVendorID && itf.TotalVfs == 0 { 176 continue 177 } 178 179 devices = append(devices, &s.Status.Interfaces[i]) 180 } 181 } 182 return devices, nil 183 } 184 185 // FindOneSriovNodeAndDevice finds a cluster node with one SR-IOV devices respecting the `SRIOV_NODE_AND_DEVICE_NAME_FILTER` filter. 
186 func (n *EnabledNodes) FindOneSriovNodeAndDevice() (string, *sriovv1.InterfaceExt, error) { 187 errs := []error{} 188 for _, node := range n.Nodes { 189 devices, err := n.FindSriovDevices(node) 190 if err != nil { 191 errs = append(errs, err) 192 continue 193 } 194 195 if len(devices) > 0 { 196 return node, devices[0], nil 197 } 198 } 199 200 return "", nil, fmt.Errorf("can't find any SR-IOV devices in cluster's nodes: %w", errors.Join(errs...)) 201 } 202 203 // FindOneVfioSriovDevice retrieves a node with a valid sriov device for vfio 204 func (n *EnabledNodes) FindOneVfioSriovDevice() (string, sriovv1.InterfaceExt) { 205 for _, node := range n.Nodes { 206 for _, nic := range n.States[node].Status.Interfaces { 207 if nic.Vendor == intelVendorID && sriovv1.IsSupportedModel(nic.Vendor, nic.DeviceID) && nic.TotalVfs != 0 { 208 return node, nic 209 } 210 } 211 } 212 return "", sriovv1.InterfaceExt{} 213 } 214 215 // FindOneMellanoxSriovDevice retrieves a valid sriov device for the given node. 
216 func (n *EnabledNodes) FindOneMellanoxSriovDevice(node string) (*sriovv1.InterfaceExt, error) { 217 s, ok := n.States[node] 218 if !ok { 219 return nil, fmt.Errorf("node %s not found", node) 220 } 221 222 // return error here as mlx interfaces are not supported when secure boot is enabled 223 // TODO: remove this when mlx support secure boot/lockdown mode 224 if n.IsSecureBootEnabled[node] { 225 return nil, fmt.Errorf("secure boot is enabled on the node mellanox cards are not supported") 226 } 227 228 for _, itf := range s.Status.Interfaces { 229 if itf.Vendor == mlxVendorID && sriovv1.IsSupportedModel(itf.Vendor, itf.DeviceID) { 230 return &itf, nil 231 } 232 } 233 234 return nil, fmt.Errorf("unable to find a mellanox sriov devices in node %s", node) 235 } 236 237 // SriovStable tells if all the node states are in sync (and the cluster is ready for another round of tests) 238 func SriovStable(operatorNamespace string, clients *testclient.ClientSet) (bool, error) { 239 nodeStates, err := clients.SriovNetworkNodeStates(operatorNamespace).List(context.Background(), metav1.ListOptions{}) 240 switch err { 241 case io.ErrUnexpectedEOF: 242 return false, err 243 case nil: 244 break 245 default: 246 return false, fmt.Errorf("failed to fetch nodes state %v", err) 247 } 248 249 if len(nodeStates.Items) == 0 { 250 return false, nil 251 } 252 for _, state := range nodeStates.Items { 253 nodeReady, err := stateStable(state) 254 if err != nil { 255 return false, err 256 } 257 if !nodeReady { 258 return false, nil 259 } 260 } 261 return true, nil 262 } 263 264 func stateStable(state sriovv1.SriovNetworkNodeState) (bool, error) { 265 switch state.Status.SyncStatus { 266 case "Succeeded": 267 return true, nil 268 // When the config daemon is restarted the status will be empty 269 // This doesn't mean the config was applied 270 case "": 271 return false, nil 272 } 273 return false, nil 274 } 275 276 func IsPFDriverSupported(driver string) bool { 277 for _, supportedDriver := 
range supportedPFDrivers { 278 if strings.Contains(driver, supportedDriver) { 279 return true 280 } 281 } 282 return false 283 } 284 285 func IsVFDriverSupported(driver string) bool { 286 for _, supportedDriver := range supportedVFDrivers { 287 if strings.Contains(driver, supportedDriver) { 288 return true 289 } 290 } 291 return false 292 } 293 294 func IsClusterStable(clients *testclient.ClientSet) (bool, error) { 295 nodes, err := clients.CoreV1Interface.Nodes().List(context.Background(), metav1.ListOptions{}) 296 if err != nil { 297 return false, err 298 } 299 300 for _, node := range nodes.Items { 301 if node.Spec.Unschedulable { 302 return false, nil 303 } 304 } 305 306 return true, nil 307 } 308 309 // IsSingleNode validates if the environment is single node cluster 310 // This is done by checking numer of nodes, it can later be substituted by an env variable if needed 311 func IsSingleNode(clients *testclient.ClientSet) (bool, error) { 312 nodes, err := clients.CoreV1Interface.Nodes().List(context.Background(), metav1.ListOptions{}) 313 if err != nil { 314 return false, err 315 } 316 return len(nodes.Items) == 1, nil 317 } 318 319 func GetNodeDrainState(clients *testclient.ClientSet, operatorNamespace string) (bool, error) { 320 sriovOperatorConfg := &sriovv1.SriovOperatorConfig{} 321 err := clients.Get(context.TODO(), runtimeclient.ObjectKey{Name: "default", Namespace: operatorNamespace}, sriovOperatorConfg) 322 return sriovOperatorConfg.Spec.DisableDrain, err 323 } 324 325 func SetDisableNodeDrainState(clients *testclient.ClientSet, operatorNamespace string, state bool) error { 326 sriovOperatorConfg := &sriovv1.SriovOperatorConfig{} 327 err := clients.Get(context.TODO(), runtimeclient.ObjectKey{Name: "default", Namespace: operatorNamespace}, sriovOperatorConfg) 328 if err != nil { 329 return err 330 } 331 sriovOperatorConfg.Spec.DisableDrain = state 332 err = clients.Update(context.TODO(), sriovOperatorConfg) 333 if err != nil { 334 return err 335 } 336 
return nil 337 } 338 339 func GetNodeSecureBootState(clients *testclient.ClientSet, nodeName, namespace string) (bool, error) { 340 podDefinition := pod.GetDefinition() 341 podDefinition = pod.RedefineWithNodeSelector(podDefinition, nodeName) 342 podDefinition = pod.RedefineAsPrivileged(podDefinition) 343 podDefinition.Namespace = namespace 344 345 volume := corev1.Volume{Name: "host", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: "/"}}} 346 mount := corev1.VolumeMount{Name: "host", MountPath: consts.Host} 347 podDefinition = pod.RedefineWithMount(podDefinition, volume, mount) 348 created, err := clients.Pods(namespace).Create(context.Background(), podDefinition, metav1.CreateOptions{}) 349 if err != nil { 350 return false, err 351 } 352 353 defer func() { 354 err = clients.Pods(namespace).Delete(context.Background(), created.Name, metav1.DeleteOptions{GracePeriodSeconds: pointer.Int64Ptr(0)}) 355 if err != nil { 356 err = fmt.Errorf("failed to remove the check secure boot status pod for node %s: %v", nodeName, err) 357 } 358 }() 359 360 var runningPod *corev1.Pod 361 err = wait.PollImmediate(time.Second, 3*time.Minute, func() (bool, error) { 362 runningPod, err = clients.Pods(namespace).Get(context.Background(), created.Name, metav1.GetOptions{}) 363 if err != nil { 364 return false, err 365 } 366 367 if runningPod.Status.Phase != corev1.PodRunning { 368 return false, nil 369 } 370 371 return true, nil 372 }) 373 if err != nil { 374 return false, err 375 } 376 377 stdout, _, err := pod.ExecCommand(clients, runningPod, "cat", "/host/sys/kernel/security/lockdown") 378 379 if strings.Contains(stdout, "No such file or directory") { 380 return false, nil 381 } 382 if err != nil { 383 return false, err 384 } 385 386 return strings.Contains(stdout, "[integrity]") || strings.Contains(stdout, "[confidentiality]"), nil 387 } 388 389 func VirtualCluster() bool { 390 if v, exist := os.LookupEnv("CLUSTER_HAS_EMULATED_PF"); exist && v != "" { 
391 return true 392 } 393 return false 394 }