github.com/openshift/dpu-operator@v0.0.0-20240502153209-3af840d137c2/internal/controller/dpuoperatorconfig_controller.go (about)

     1  /*
     2  Copyright 2024.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controller
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"os"
    23  
    24  	"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/apply"
    25  	"github.com/openshift/cluster-network-operator/pkg/render"
    26  	configv1 "github.com/openshift/dpu-operator/api/v1"
    27  	appsv1 "k8s.io/api/apps/v1"
    28  	"k8s.io/apimachinery/pkg/api/errors"
    29  	"k8s.io/apimachinery/pkg/runtime"
    30  	ctrl "sigs.k8s.io/controller-runtime"
    31  	"sigs.k8s.io/controller-runtime/pkg/client"
    32  	"sigs.k8s.io/controller-runtime/pkg/log"
    33  )
    34  
// DpuOperatorConfigReconciler reconciles a DpuOperatorConfig object
type DpuOperatorConfigReconciler struct {
	// Client is the embedded Kubernetes client; the reconciler uses it to
	// fetch the DpuOperatorConfig and hands it to apply.ApplyObject when
	// applying rendered manifests.
	client.Client
	// Scheme is used to set the DpuOperatorConfig as controller owner of the
	// rendered objects and to convert between unstructured and typed objects.
	Scheme *runtime.Scheme
}
    40  
    41  //+kubebuilder:rbac:groups=config.openshift.io,resources=dpuoperatorconfigs,verbs=get;list;watch;create;update;patch;delete
    42  //+kubebuilder:rbac:groups=config.openshift.io,resources=dpuoperatorconfigs/status,verbs=get;update;patch
    43  //+kubebuilder:rbac:groups=config.openshift.io,resources=dpuoperatorconfigs/finalizers,verbs=update
    44  //+kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;update;patch;delete
    45  //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=get;list;watch;create;update;patch;delete
    46  //+kubebuilder:rbac:groups="",resources=roles,resources=*,verbs=get;list;watch;create;update;patch;delete
    47  //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=get;list;watch;create;update;patch;delete
    48  //+kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,resourceNames=anyuid;hostnetwork;privileged,verbs=use
    49  //+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch;create;update;patch;delete
    50  //+kubebuilder:rbac:groups=k8s.cni.cncf.io,resources=network-attachment-definitions,verbs=get;list;watch;create;update;patch;delete
    51  
    52  // Reconcile is part of the main kubernetes reconciliation loop which aims to
    53  // move the current state of the cluster closer to the desired state.
    54  // TODO(user): Modify the Reconcile function to compare the state specified by
    55  // the DpuOperatorConfig object against the actual cluster state, and then
    56  // perform operations to make the cluster state reflect the state specified by
    57  // the user.
    58  //
    59  // For more details, check Reconcile and its Result here:
    60  // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.15.0/pkg/reconcile
    61  func (r *DpuOperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
    62  	logger := log.FromContext(ctx)
    63  
    64  	dpuOperatorConfig := &configv1.DpuOperatorConfig{}
    65  	if err := r.Get(ctx, req.NamespacedName, dpuOperatorConfig); err != nil {
    66  		if errors.IsNotFound(err) {
    67  			logger.Info("DpuOperatorConfig resource not found. Ignoring.")
    68  			return ctrl.Result{}, nil
    69  		}
    70  		logger.Error(err, "Failed to get DpuOperatorConfig resource")
    71  		return ctrl.Result{}, err
    72  	}
    73  	err := r.ensureDpuDeamonSetRunning(ctx, dpuOperatorConfig)
    74  	if err != nil {
    75  		logger.Error(err, "Failed to ensure Daemon is running")
    76  	}
    77  	err = r.ensureSriovDevicePluginRunning(ctx, dpuOperatorConfig)
    78  	if err != nil {
    79  		logger.Error(err, "Failed to ensure SRIOV Device Plugin DaemonSet is running")
    80  	}
    81  	err = r.createNetworkFunctionNad(ctx, dpuOperatorConfig)
    82  	if err != nil {
    83  		logger.Error(err, "Failed to create Network Function NAD")
    84  	}
    85  
    86  	return ctrl.Result{}, nil
    87  }
    88  
    89  func getImagePullPolicy() string {
    90  	if value, ok := os.LookupEnv("IMAGE_PULL_POLICIES"); ok {
    91  		return value
    92  	}
    93  	return "IfNotPresent"
    94  }
    95  
    96  func setCommonData(data *render.RenderData, cfg *configv1.DpuOperatorConfig) {
    97  	data.Data["Namespace"] = cfg.Namespace
    98  	data.Data["ImagePullPolicy"] = getImagePullPolicy()
    99  }
   100  
   101  func (r *DpuOperatorConfigReconciler) ensureDpuDeamonSetRunning(ctx context.Context, cfg *configv1.DpuOperatorConfig) error {
   102  	var err error
   103  
   104  	logger := log.FromContext(ctx)
   105  	data := render.MakeRenderData()
   106  	// All the CRs will be in the same namespace as the operator config
   107  	setCommonData(&data, cfg)
   108  	data.Data["Mode"] = cfg.Spec.Mode
   109  	dpuDaemonImage := os.Getenv("DPU_DAEMON_IMAGE")
   110  	if dpuDaemonImage == "" {
   111  		return fmt.Errorf("DPU_DAEMON_IMAGE not set")
   112  	}
   113  	data.Data["DpuOperatorDaemonImage"] = dpuDaemonImage
   114  
   115  	logger.Info("Ensuring that DPU DaemonSet is running", "image", dpuDaemonImage)
   116  	objs, err := render.RenderDir("./bindata/daemon", &data)
   117  	if err != nil {
   118  		logger.Error(err, "Failed to render dpu daemon manifests")
   119  		return err
   120  	}
   121  
   122  	for _, obj := range objs {
   123  		if err := ctrl.SetControllerReference(cfg, obj, r.Scheme); err != nil {
   124  			return err
   125  		}
   126  	}
   127  
   128  	for _, obj := range objs {
   129  		logger.Info("Preparing CR", "kind", obj.GetKind())
   130  		if obj.GetKind() == "DaemonSet" {
   131  			scheme := r.Scheme
   132  			ds := &appsv1.DaemonSet{}
   133  			err = scheme.Convert(obj, ds, nil)
   134  			if err != nil {
   135  				logger.Error(err, "Fail to convert to DaemonSet")
   136  				return err
   137  			}
   138  			ds.Spec.Template.Spec.NodeSelector["dpu"] = "true"
   139  			err = scheme.Convert(ds, obj, nil)
   140  			if err != nil {
   141  				logger.Error(err, "Fail to convert to Unstructured")
   142  				return err
   143  			}
   144  		}
   145  		if err := apply.ApplyObject(context.TODO(), r.Client, obj); err != nil {
   146  			return fmt.Errorf("failed to apply object %v with err: %v", obj, err)
   147  		}
   148  	}
   149  	return nil
   150  }
   151  
   152  func (r *DpuOperatorConfigReconciler) ensureSriovDevicePluginRunning(ctx context.Context, cfg *configv1.DpuOperatorConfig) error {
   153  	logger := log.FromContext(ctx)
   154  	// There will be a device plugin running in the daemon
   155  	if cfg.Spec.Mode == "host" {
   156  		data := render.MakeRenderData()
   157  		// All the CRs will be in the same namespace as the operator config
   158  		setCommonData(&data, cfg)
   159  
   160  		logger.Info("Ensuring that SRIOV Device Plugin DaemonSet is running")
   161  		objs, err := render.RenderDir("./bindata/sriov-device-plugin", &data)
   162  		if err != nil {
   163  			logger.Error(err, "Failed to render SRIOV Device Plugin DaemonSet manifests")
   164  			return err
   165  		}
   166  
   167  		for _, obj := range objs {
   168  			if err := ctrl.SetControllerReference(cfg, obj, r.Scheme); err != nil {
   169  				return err
   170  			}
   171  			if err := apply.ApplyObject(context.TODO(), r.Client, obj); err != nil {
   172  				return fmt.Errorf("failed to apply object %v with err: %v", obj, err)
   173  			}
   174  		}
   175  	}
   176  	return nil
   177  }
   178  
   179  func (r *DpuOperatorConfigReconciler) createNetworkFunctionNad(ctx context.Context, cfg *configv1.DpuOperatorConfig) error {
   180  	logger := log.FromContext(ctx)
   181  	if cfg.Spec.Mode == "dpu" {
   182  		data := render.MakeRenderData()
   183  		// All the CRs will be in the same namespace as the operator config
   184  		setCommonData(&data, cfg)
   185  		data.Data["ResourceName"] = "openshift.io/dpu" // FIXME: Hardcode for now
   186  
   187  		logger.Info("Create the Network Function NAD")
   188  		objs, err := render.RenderDir("./bindata/networkfn-nad", &data)
   189  		if err != nil {
   190  			logger.Error(err, "Failed to render SRIOV Device Plugin DaemonSet manifests")
   191  			return err
   192  		}
   193  
   194  		for _, obj := range objs {
   195  			if err := ctrl.SetControllerReference(cfg, obj, r.Scheme); err != nil {
   196  				return err
   197  			}
   198  			if err := apply.ApplyObject(context.TODO(), r.Client, obj); err != nil {
   199  				return fmt.Errorf("failed to apply object %v with err: %v", obj, err)
   200  			}
   201  		}
   202  	}
   203  	return nil
   204  }
   205  
   206  // SetupWithManager sets up the controller with the Manager.
   207  func (r *DpuOperatorConfigReconciler) SetupWithManager(mgr ctrl.Manager) error {
   208  	return ctrl.NewControllerManagedBy(mgr).
   209  		For(&configv1.DpuOperatorConfig{}).
   210  		Complete(r)
   211  }