github.com/k8snetworkplumbingwg/sriov-network-operator@v1.2.1-0.20240408194816-2d2e5a45d453/cmd/sriov-network-config-daemon/service.go (about)

     1  /*
     2  Copyright 2023.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  package main
    17  
    18  import (
    19  	"errors"
    20  	"fmt"
    21  	"os"
    22  
    23  	"github.com/go-logr/logr"
    24  	"github.com/spf13/cobra"
    25  	"sigs.k8s.io/controller-runtime/pkg/log"
    26  
    27  	sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
    28  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
    29  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
    30  	snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log"
    31  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
    32  	plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins"
    33  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins/generic"
    34  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins/virtual"
    35  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/systemd"
    36  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
    37  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/version"
    38  )
    39  
// Configuration phases accepted by the "--phase" flag of the service command.
const (
	PhasePre  = "pre"  // "pre" phase; used as the default value of the "--phase" flag
	PhasePost = "post" // "post" phase
)
    44  
var (
	// serviceCmd is the "service" subcommand; its RunE handler is runServiceCmd.
	serviceCmd = &cobra.Command{
		Use:   "service",
		Short: "Starts SR-IOV service Config",
		Long:  "",
		RunE:  runServiceCmd,
	}
	// phaseArg receives the value of the "--phase" flag registered in init.
	phaseArg string

	// Constructors are referenced through package-level variables instead of being called directly.
	// NOTE(review): presumably so tests can substitute fakes — confirm against the test files.
	newGenericPluginFunc  = generic.NewGenericPlugin
	newVirtualPluginFunc  = virtual.NewVirtualPlugin
	newHostHelpersFunc    = helper.NewDefaultHostHelpers
	newPlatformHelperFunc = platforms.NewDefaultPlatformHelper
)
    59  
    60  func init() {
    61  	rootCmd.AddCommand(serviceCmd)
    62  	serviceCmd.Flags().StringVarP(&phaseArg, "phase", "p", PhasePre, fmt.Sprintf("configuration phase, supported values are: %s, %s", PhasePre, PhasePost))
    63  }
    64  
    65  // The service supports two configuration phases:
    66  // * pre(default) - before the NetworkManager or systemd-networkd
    67  // * post - after the NetworkManager or systemd-networkd
    68  // "sriov-config" systemd unit is responsible for starting the service in the "pre" phase mode.
    69  // "sriov-config-post-network" systemd unit starts the service in the "post" phase mode.
    70  // The service may use different plugins for each phase and call different initialization flows.
    71  // The "post" phase checks the completion status of the "pre" phase by reading the sriov result file.
    72  // The "pre" phase should set "InProgress" status if it succeeds or "Failed" otherwise.
    73  // If the result of the "pre" phase is different than "InProgress", then the "post" phase will not be executed
    74  // and the execution result will be forcefully set to "Failed".
    75  func runServiceCmd(cmd *cobra.Command, args []string) error {
    76  	if phaseArg != PhasePre && phaseArg != PhasePost {
    77  		return fmt.Errorf("invalid value for \"--phase\" argument, valid values are: %s, %s", PhasePre, PhasePost)
    78  	}
    79  	// init logger
    80  	snolog.InitLog()
    81  	setupLog := log.Log.WithName("sriov-config-service").WithValues("phase", phaseArg)
    82  
    83  	setupLog.V(0).Info("Starting sriov-config-service", "version", version.Version)
    84  
    85  	// Mark that we are running on host
    86  	vars.UsingSystemdMode = true
    87  	vars.InChroot = true
    88  
    89  	sriovConf, err := readConf(setupLog)
    90  	if err != nil {
    91  		return updateSriovResultErr(setupLog, phaseArg, err)
    92  	}
    93  	setupLog.V(2).Info("sriov-config-service", "config", sriovConf)
    94  	vars.DevMode = sriovConf.UnsupportedNics
    95  
    96  	if err := initSupportedNics(); err != nil {
    97  		return updateSriovResultErr(setupLog, phaseArg, fmt.Errorf("failed to initialize list of supported NIC ids: %v", err))
    98  	}
    99  
   100  	hostHelpers, err := newHostHelpersFunc()
   101  	if err != nil {
   102  		return updateSriovResultErr(setupLog, phaseArg, fmt.Errorf("failed to create hostHelpers: %v", err))
   103  	}
   104  
   105  	if phaseArg == PhasePre {
   106  		err = phasePre(setupLog, sriovConf, hostHelpers)
   107  	} else {
   108  		err = phasePost(setupLog, sriovConf, hostHelpers)
   109  	}
   110  	if err != nil {
   111  		return updateSriovResultErr(setupLog, phaseArg, err)
   112  	}
   113  	return updateSriovResultOk(setupLog, phaseArg)
   114  }
   115  
   116  func readConf(setupLog logr.Logger) (*systemd.SriovConfig, error) {
   117  	nodeStateSpec, err := systemd.ReadConfFile()
   118  	if err != nil {
   119  		if _, err := os.Stat(systemd.SriovSystemdConfigPath); !errors.Is(err, os.ErrNotExist) {
   120  			return nil, fmt.Errorf("failed to read the sriov configuration file in path %s: %v", systemd.SriovSystemdConfigPath, err)
   121  		}
   122  		setupLog.Info("configuration file not found, use default config")
   123  		nodeStateSpec = &systemd.SriovConfig{
   124  			Spec:            sriovv1.SriovNetworkNodeStateSpec{},
   125  			UnsupportedNics: false,
   126  			PlatformType:    consts.Baremetal,
   127  		}
   128  	}
   129  	return nodeStateSpec, nil
   130  }
   131  
   132  func initSupportedNics() error {
   133  	supportedNicIds, err := systemd.ReadSriovSupportedNics()
   134  	if err != nil {
   135  		return fmt.Errorf("failed to read list of supported nic ids: %v", err)
   136  	}
   137  	sriovv1.InitNicIDMapFromList(supportedNicIds)
   138  	return nil
   139  }
   140  
   141  func phasePre(setupLog logr.Logger, conf *systemd.SriovConfig, hostHelpers helper.HostHelpersInterface) error {
   142  	// make sure there is no stale result file to avoid situation when we
   143  	// read outdated info in the Post phase when the Pre silently failed (should not happen)
   144  	if err := systemd.RemoveSriovResult(); err != nil {
   145  		return fmt.Errorf("failed to remove sriov result file: %v", err)
   146  	}
   147  
   148  	_, err := hostHelpers.TryEnableRdma()
   149  	if err != nil {
   150  		setupLog.Error(err, "warning, failed to enable RDMA")
   151  	}
   152  	hostHelpers.TryEnableTun()
   153  	hostHelpers.TryEnableVhostNet()
   154  
   155  	return callPlugin(setupLog, PhasePre, conf, hostHelpers)
   156  }
   157  
   158  func phasePost(setupLog logr.Logger, conf *systemd.SriovConfig, hostHelpers helper.HostHelpersInterface) error {
   159  	setupLog.V(0).Info("check result of the Pre phase")
   160  	prePhaseResult, err := systemd.ReadSriovResult()
   161  	if err != nil {
   162  		return fmt.Errorf("failed to read result of the pre phase: %v", err)
   163  	}
   164  	if prePhaseResult.SyncStatus != consts.SyncStatusInProgress {
   165  		return fmt.Errorf("unexpected result of the pre phase: %s, syncError: %s", prePhaseResult.SyncStatus, prePhaseResult.LastSyncError)
   166  	}
   167  	setupLog.V(0).Info("Pre phase succeed, continue execution")
   168  
   169  	return callPlugin(setupLog, PhasePost, conf, hostHelpers)
   170  }
   171  
   172  func callPlugin(setupLog logr.Logger, phase string, conf *systemd.SriovConfig, hostHelpers helper.HostHelpersInterface) error {
   173  	configPlugin, err := getPlugin(setupLog, phase, conf, hostHelpers)
   174  	if err != nil {
   175  		return err
   176  	}
   177  
   178  	if configPlugin == nil {
   179  		setupLog.V(0).Info("no plugin for the platform for the current phase, skip calling", "platform", conf.PlatformType)
   180  		return nil
   181  	}
   182  
   183  	nodeState, err := getNetworkNodeState(setupLog, conf, hostHelpers)
   184  	if err != nil {
   185  		return nil
   186  	}
   187  	_, _, err = configPlugin.OnNodeStateChange(nodeState)
   188  	if err != nil {
   189  		return fmt.Errorf("failed to run OnNodeStateChange to update the plugin status %v", err)
   190  	}
   191  
   192  	if err = configPlugin.Apply(); err != nil {
   193  		return fmt.Errorf("failed to apply configuration: %v", err)
   194  	}
   195  	setupLog.V(0).Info("plugin call succeed")
   196  	return nil
   197  }
   198  
   199  func getPlugin(setupLog logr.Logger, phase string,
   200  	conf *systemd.SriovConfig, hostHelpers helper.HostHelpersInterface) (plugin.VendorPlugin, error) {
   201  	var (
   202  		configPlugin plugin.VendorPlugin
   203  		err          error
   204  	)
   205  	switch conf.PlatformType {
   206  	case consts.Baremetal:
   207  		switch phase {
   208  		case PhasePre:
   209  			configPlugin, err = newGenericPluginFunc(hostHelpers, generic.WithSkipVFConfiguration())
   210  		case PhasePost:
   211  			configPlugin, err = newGenericPluginFunc(hostHelpers)
   212  		}
   213  		if err != nil {
   214  			return nil, fmt.Errorf("failed to create generic plugin for %v", err)
   215  		}
   216  	case consts.VirtualOpenStack:
   217  		switch phase {
   218  		case PhasePre:
   219  			configPlugin, err = newVirtualPluginFunc(hostHelpers)
   220  			if err != nil {
   221  				return nil, fmt.Errorf("failed to create virtual plugin %v", err)
   222  			}
   223  		case PhasePost:
   224  			setupLog.Info("skip post configuration phase for virtual cluster")
   225  			return nil, nil
   226  		}
   227  	}
   228  	return configPlugin, nil
   229  }
   230  
   231  func getNetworkNodeState(setupLog logr.Logger, conf *systemd.SriovConfig,
   232  	hostHelpers helper.HostHelpersInterface) (*sriovv1.SriovNetworkNodeState, error) {
   233  	var (
   234  		ifaceStatuses []sriovv1.InterfaceExt
   235  		err           error
   236  	)
   237  	switch conf.PlatformType {
   238  	case consts.Baremetal:
   239  		ifaceStatuses, err = hostHelpers.DiscoverSriovDevices(hostHelpers)
   240  		if err != nil {
   241  			return nil, fmt.Errorf("failed to discover sriov devices on the host:  %v", err)
   242  		}
   243  	case consts.VirtualOpenStack:
   244  		platformHelper, err := newPlatformHelperFunc()
   245  		if err != nil {
   246  			return nil, fmt.Errorf("failed to create platformHelpers")
   247  		}
   248  		err = platformHelper.CreateOpenstackDevicesInfo()
   249  		if err != nil {
   250  			return nil, fmt.Errorf("failed to read OpenStack data: %v", err)
   251  		}
   252  		ifaceStatuses, err = platformHelper.DiscoverSriovDevicesVirtual()
   253  		if err != nil {
   254  			return nil, fmt.Errorf("failed to discover devices: %v", err)
   255  		}
   256  	}
   257  	return &sriovv1.SriovNetworkNodeState{
   258  		Spec:   conf.Spec,
   259  		Status: sriovv1.SriovNetworkNodeStateStatus{Interfaces: ifaceStatuses},
   260  	}, nil
   261  }
   262  
   263  func updateSriovResultErr(setupLog logr.Logger, phase string, origErr error) error {
   264  	setupLog.Error(origErr, "service call failed")
   265  	err := updateResult(setupLog, consts.SyncStatusFailed, fmt.Sprintf("%s: %v", phase, origErr))
   266  	if err != nil {
   267  		return err
   268  	}
   269  	return origErr
   270  }
   271  
   272  func updateSriovResultOk(setupLog logr.Logger, phase string) error {
   273  	setupLog.V(0).Info("service call succeed")
   274  	syncStatus := consts.SyncStatusSucceeded
   275  	if phase == PhasePre {
   276  		syncStatus = consts.SyncStatusInProgress
   277  	}
   278  	return updateResult(setupLog, syncStatus, "")
   279  }
   280  
   281  func updateResult(setupLog logr.Logger, result, msg string) error {
   282  	sriovResult := &systemd.SriovResult{
   283  		SyncStatus:    result,
   284  		LastSyncError: msg,
   285  	}
   286  	err := systemd.WriteSriovResult(sriovResult)
   287  	if err != nil {
   288  		setupLog.Error(err, "failed to write sriov result file", "content", *sriovResult)
   289  		return fmt.Errorf("sriov-config-service failed to write sriov result file with content %v error: %v", *sriovResult, err)
   290  	}
   291  	setupLog.V(0).Info("result file updated", "SyncStatus", sriovResult.SyncStatus, "LastSyncError", msg)
   292  	return nil
   293  }