github.com/k8snetworkplumbingwg/sriov-network-operator@v1.2.1-0.20240408194816-2d2e5a45d453/cmd/sriov-network-config-daemon/start.go (about)

     1  /*
     2  Copyright 2023.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  package main
    17  
    18  import (
    19  	"context"
    20  	"fmt"
    21  	"net"
    22  	"net/url"
    23  	"os"
    24  	"strings"
    25  	"time"
    26  
    27  	"github.com/spf13/cobra"
    28  	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/client-go/kubernetes"
    30  	"k8s.io/client-go/kubernetes/scheme"
    31  	"k8s.io/client-go/rest"
    32  	"k8s.io/client-go/tools/clientcmd"
    33  	"k8s.io/client-go/util/connrotation"
    34  	"sigs.k8s.io/controller-runtime/pkg/client"
    35  	"sigs.k8s.io/controller-runtime/pkg/log"
    36  
    37  	configv1 "github.com/openshift/api/config/v1"
    38  	mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
    39  
    40  	sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
    41  	snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned"
    42  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
    43  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/daemon"
    44  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
    45  	snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log"
    46  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
    47  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
    48  )
    49  
    50  // stringList is a list of strings, implements pflag.Value interface
    51  type stringList []string
    52  
    53  func (sl *stringList) String() string {
    54  	return strings.Join(*sl, ",")
    55  }
    56  
    57  func (sl *stringList) Set(arg string) error {
    58  	elems := strings.Split(arg, ",")
    59  
    60  	for _, elem := range elems {
    61  		if len(elem) == 0 {
    62  			return fmt.Errorf("empty plugin name")
    63  		}
    64  		*sl = append(*sl, elem)
    65  	}
    66  	return nil
    67  }
    68  
    69  func (sl *stringList) Type() string {
    70  	return "CommaSeparatedString"
    71  }
    72  
    73  var (
    74  	startCmd = &cobra.Command{
    75  		Use:   "start",
    76  		Short: "Starts SR-IOV Network Config Daemon",
    77  		Long:  "",
    78  		RunE:  runStartCmd,
    79  	}
    80  
    81  	startOpts struct {
    82  		kubeconfig        string
    83  		nodeName          string
    84  		systemd           bool
    85  		disabledPlugins   stringList
    86  		parallelNicConfig bool
    87  	}
    88  )
    89  
    90  func init() {
    91  	rootCmd.AddCommand(startCmd)
    92  	startCmd.PersistentFlags().StringVar(&startOpts.kubeconfig, "kubeconfig", "", "Kubeconfig file to access a remote cluster (testing only)")
    93  	startCmd.PersistentFlags().StringVar(&startOpts.nodeName, "node-name", "", "kubernetes node name daemon is managing")
    94  	startCmd.PersistentFlags().BoolVar(&startOpts.systemd, "use-systemd-service", false, "use config daemon in systemd mode")
    95  	startCmd.PersistentFlags().VarP(&startOpts.disabledPlugins, "disable-plugins", "", "comma-separated list of plugins to disable")
    96  	startCmd.PersistentFlags().BoolVar(&startOpts.parallelNicConfig, "parallel-nic-config", false, "perform NIC configuration in parallel")
    97  }
    98  
    99  func runStartCmd(cmd *cobra.Command, args []string) error {
   100  	// init logger
   101  	snolog.InitLog()
   102  	setupLog := log.Log.WithName("sriov-network-config-daemon")
   103  
   104  	// Mark that we are running inside a container
   105  	vars.UsingSystemdMode = false
   106  	if startOpts.systemd {
   107  		vars.UsingSystemdMode = true
   108  	}
   109  
   110  	vars.ParallelNicConfig = startOpts.parallelNicConfig
   111  
   112  	if startOpts.nodeName == "" {
   113  		name, ok := os.LookupEnv("NODE_NAME")
   114  		if !ok || name == "" {
   115  			return fmt.Errorf("node-name is required")
   116  		}
   117  		startOpts.nodeName = name
   118  	}
   119  	vars.NodeName = startOpts.nodeName
   120  
   121  	for _, p := range startOpts.disabledPlugins {
   122  		if _, ok := vars.DisableablePlugins[p]; !ok {
   123  			return fmt.Errorf("%s plugin cannot be disabled", p)
   124  		}
   125  	}
   126  
   127  	// This channel is used to ensure all spawned goroutines exit when we exit.
   128  	stopCh := make(chan struct{})
   129  	defer close(stopCh)
   130  
   131  	// This channel is used to signal Run() something failed and to jump ship.
   132  	// It's purely a chan<- in the Daemon struct for goroutines to write to, and
   133  	// a <-chan in Run() for the main thread to listen on.
   134  	exitCh := make(chan error)
   135  	defer close(exitCh)
   136  
   137  	// This channel is to make sure main thread will wait until the writer finish
   138  	// to report lastSyncError in SriovNetworkNodeState object.
   139  	syncCh := make(chan struct{})
   140  	defer close(syncCh)
   141  
   142  	refreshCh := make(chan daemon.Message)
   143  	defer close(refreshCh)
   144  
   145  	var config *rest.Config
   146  	var err error
   147  
   148  	// On openshift we use the kubeconfig from kubelet on the node where the daemon is running
   149  	// this allow us to improve security as every daemon has access only to its own node
   150  	if vars.ClusterType == consts.ClusterTypeOpenshift {
   151  		kubeconfig, err := clientcmd.LoadFromFile("/host/etc/kubernetes/kubeconfig")
   152  		if err != nil {
   153  			setupLog.Error(err, "failed to load kubelet kubeconfig")
   154  		}
   155  		clusterName := kubeconfig.Contexts[kubeconfig.CurrentContext].Cluster
   156  		apiURL := kubeconfig.Clusters[clusterName].Server
   157  
   158  		urlPath, err := url.Parse(apiURL)
   159  		if err != nil {
   160  			setupLog.Error(err, "failed to parse api url from kubelet kubeconfig")
   161  		}
   162  
   163  		// The kubernetes in-cluster functions don't let you override the apiserver
   164  		// directly; gotta "pass" it via environment vars.
   165  		setupLog.V(0).Info("overriding kubernetes api", "new-url", apiURL)
   166  		err = os.Setenv("KUBERNETES_SERVICE_HOST", urlPath.Hostname())
   167  		if err != nil {
   168  			setupLog.Error(err, "failed to set KUBERNETES_SERVICE_HOST environment variable")
   169  		}
   170  		err = os.Setenv("KUBERNETES_SERVICE_PORT", urlPath.Port())
   171  		if err != nil {
   172  			setupLog.Error(err, "failed to set KUBERNETES_SERVICE_PORT environment variable")
   173  		}
   174  	}
   175  
   176  	kubeconfig := os.Getenv("KUBECONFIG")
   177  	if kubeconfig != "" {
   178  		config, err = clientcmd.BuildConfigFromFlags("", kubeconfig)
   179  	} else {
   180  		// creates the in-cluster config
   181  		config, err = rest.InClusterConfig()
   182  	}
   183  
   184  	if err != nil {
   185  		return err
   186  	}
   187  
   188  	vars.Config = config
   189  	vars.Scheme = scheme.Scheme
   190  
   191  	closeAllConns, err := updateDialer(config)
   192  	if err != nil {
   193  		return err
   194  	}
   195  
   196  	err = sriovnetworkv1.AddToScheme(scheme.Scheme)
   197  	if err != nil {
   198  		setupLog.Error(err, "failed to load sriov network CRDs to scheme")
   199  		return err
   200  	}
   201  
   202  	err = mcfgv1.AddToScheme(scheme.Scheme)
   203  	if err != nil {
   204  		setupLog.Error(err, "failed to load machine config CRDs to scheme")
   205  		return err
   206  	}
   207  
   208  	err = configv1.Install(scheme.Scheme)
   209  	if err != nil {
   210  		setupLog.Error(err, "failed to load openshift config CRDs to scheme")
   211  		return err
   212  	}
   213  
   214  	kClient, err := client.New(config, client.Options{Scheme: scheme.Scheme})
   215  	if err != nil {
   216  		setupLog.Error(err, "couldn't create client")
   217  		os.Exit(1)
   218  	}
   219  
   220  	snclient := snclientset.NewForConfigOrDie(config)
   221  	kubeclient := kubernetes.NewForConfigOrDie(config)
   222  
   223  	hostHelpers, err := helper.NewDefaultHostHelpers()
   224  	if err != nil {
   225  		setupLog.Error(err, "failed to create hostHelpers")
   226  		return err
   227  	}
   228  
   229  	platformHelper, err := platforms.NewDefaultPlatformHelper()
   230  	if err != nil {
   231  		setupLog.Error(err, "failed to create platformHelper")
   232  		return err
   233  	}
   234  
   235  	config.Timeout = 5 * time.Second
   236  	writerclient := snclientset.NewForConfigOrDie(config)
   237  
   238  	eventRecorder := daemon.NewEventRecorder(writerclient, kubeclient)
   239  	defer eventRecorder.Shutdown()
   240  
   241  	setupLog.V(0).Info("starting node writer")
   242  	nodeWriter := daemon.NewNodeStateStatusWriter(writerclient,
   243  		closeAllConns,
   244  		eventRecorder,
   245  		hostHelpers,
   246  		platformHelper)
   247  
   248  	nodeInfo, err := kubeclient.CoreV1().Nodes().Get(context.Background(), startOpts.nodeName, v1.GetOptions{})
   249  	if err == nil {
   250  		for key, pType := range vars.PlatformsMap {
   251  			if strings.Contains(strings.ToLower(nodeInfo.Spec.ProviderID), strings.ToLower(key)) {
   252  				vars.PlatformType = pType
   253  			}
   254  		}
   255  	} else {
   256  		setupLog.Error(err, "failed to fetch node state, exiting", "node-name", startOpts.nodeName)
   257  		return err
   258  	}
   259  	setupLog.Info("Running on", "platform", vars.PlatformType.String())
   260  
   261  	if err := sriovnetworkv1.InitNicIDMapFromConfigMap(kubeclient, vars.Namespace); err != nil {
   262  		setupLog.Error(err, "failed to run init NicIdMap")
   263  		return err
   264  	}
   265  
   266  	eventRecorder.SendEvent("ConfigDaemonStart", "Config Daemon starting")
   267  
   268  	// block the deamon process until nodeWriter finish first its run
   269  	err = nodeWriter.RunOnce()
   270  	if err != nil {
   271  		setupLog.Error(err, "failed to run writer")
   272  		return err
   273  	}
   274  	go nodeWriter.Run(stopCh, refreshCh, syncCh)
   275  
   276  	setupLog.V(0).Info("Starting SriovNetworkConfigDaemon")
   277  	err = daemon.New(
   278  		kClient,
   279  		snclient,
   280  		kubeclient,
   281  		hostHelpers,
   282  		platformHelper,
   283  		exitCh,
   284  		stopCh,
   285  		syncCh,
   286  		refreshCh,
   287  		eventRecorder,
   288  		startOpts.disabledPlugins,
   289  	).Run(stopCh, exitCh)
   290  	if err != nil {
   291  		setupLog.Error(err, "failed to run daemon")
   292  	}
   293  	setupLog.V(0).Info("Shutting down SriovNetworkConfigDaemon")
   294  	return err
   295  }
   296  
   297  // updateDialer instruments a restconfig with a dial. the returned function allows forcefully closing all active connections.
   298  func updateDialer(clientConfig *rest.Config) (func(), error) {
   299  	if clientConfig.Transport != nil || clientConfig.Dial != nil {
   300  		return nil, fmt.Errorf("there is already a transport or dialer configured")
   301  	}
   302  	f := &net.Dialer{Timeout: 30 * time.Second, KeepAlive: 30 * time.Second}
   303  	d := connrotation.NewDialer(f.DialContext)
   304  	clientConfig.Dial = d.DialContext
   305  	return d.CloseAll, nil
   306  }