github.com/openshift/dpu-operator@v0.0.0-20240502153209-3af840d137c2/internal/controller/dpuoperatorconfig_controller.go (about) 1 /* 2 Copyright 2024. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package controller 18 19 import ( 20 "context" 21 "fmt" 22 "os" 23 24 "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/apply" 25 "github.com/openshift/cluster-network-operator/pkg/render" 26 configv1 "github.com/openshift/dpu-operator/api/v1" 27 appsv1 "k8s.io/api/apps/v1" 28 "k8s.io/apimachinery/pkg/api/errors" 29 "k8s.io/apimachinery/pkg/runtime" 30 ctrl "sigs.k8s.io/controller-runtime" 31 "sigs.k8s.io/controller-runtime/pkg/client" 32 "sigs.k8s.io/controller-runtime/pkg/log" 33 ) 34 35 // DpuOperatorConfigReconciler reconciles a DpuOperatorConfig object 36 type DpuOperatorConfigReconciler struct { 37 client.Client 38 Scheme *runtime.Scheme 39 } 40 41 //+kubebuilder:rbac:groups=config.openshift.io,resources=dpuoperatorconfigs,verbs=get;list;watch;create;update;patch;delete 42 //+kubebuilder:rbac:groups=config.openshift.io,resources=dpuoperatorconfigs/status,verbs=get;update;patch 43 //+kubebuilder:rbac:groups=config.openshift.io,resources=dpuoperatorconfigs/finalizers,verbs=update 44 //+kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;update;patch;delete 45 //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=get;list;watch;create;update;patch;delete 46 //+kubebuilder:rbac:groups="",resources=roles,resources=*,verbs=get;list;watch;create;update;patch;delete 47 //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=get;list;watch;create;update;patch;delete 48 //+kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,resourceNames=anyuid;hostnetwork;privileged,verbs=use 49 //+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch;create;update;patch;delete 50 //+kubebuilder:rbac:groups=k8s.cni.cncf.io,resources=network-attachment-definitions,verbs=get;list;watch;create;update;patch;delete 51 52 // Reconcile is part of the main kubernetes reconciliation loop which aims to 53 // move the current state of the cluster closer to the desired state. 54 // TODO(user): Modify the Reconcile function to compare the state specified by 55 // the DpuOperatorConfig object against the actual cluster state, and then 56 // perform operations to make the cluster state reflect the state specified by 57 // the user. 58 // 59 // For more details, check Reconcile and its Result here: 60 // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.15.0/pkg/reconcile 61 func (r *DpuOperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 62 logger := log.FromContext(ctx) 63 64 dpuOperatorConfig := &configv1.DpuOperatorConfig{} 65 if err := r.Get(ctx, req.NamespacedName, dpuOperatorConfig); err != nil { 66 if errors.IsNotFound(err) { 67 logger.Info("DpuOperatorConfig resource not found. Ignoring.") 68 return ctrl.Result{}, nil 69 } 70 logger.Error(err, "Failed to get DpuOperatorConfig resource") 71 return ctrl.Result{}, err 72 } 73 err := r.ensureDpuDeamonSetRunning(ctx, dpuOperatorConfig) 74 if err != nil { 75 logger.Error(err, "Failed to ensure Daemon is running") 76 } 77 err = r.ensureSriovDevicePluginRunning(ctx, dpuOperatorConfig) 78 if err != nil { 79 logger.Error(err, "Failed to ensure SRIOV Device Plugin DaemonSet is running") 80 } 81 err = r.createNetworkFunctionNad(ctx, dpuOperatorConfig) 82 if err != nil { 83 logger.Error(err, "Failed to create Network Function NAD") 84 } 85 86 return ctrl.Result{}, nil 87 } 88 89 func getImagePullPolicy() string { 90 if value, ok := os.LookupEnv("IMAGE_PULL_POLICIES"); ok { 91 return value 92 } 93 return "IfNotPresent" 94 } 95 96 func setCommonData(data *render.RenderData, cfg *configv1.DpuOperatorConfig) { 97 data.Data["Namespace"] = cfg.Namespace 98 data.Data["ImagePullPolicy"] = getImagePullPolicy() 99 } 100 101 func (r *DpuOperatorConfigReconciler) ensureDpuDeamonSetRunning(ctx context.Context, cfg *configv1.DpuOperatorConfig) error { 102 var err error 103 104 logger := log.FromContext(ctx) 105 data := render.MakeRenderData() 106 // All the CRs will be in the same namespace as the operator config 107 setCommonData(&data, cfg) 108 data.Data["Mode"] = cfg.Spec.Mode 109 dpuDaemonImage := os.Getenv("DPU_DAEMON_IMAGE") 110 if dpuDaemonImage == "" { 111 return fmt.Errorf("DPU_DAEMON_IMAGE not set") 112 } 113 data.Data["DpuOperatorDaemonImage"] = dpuDaemonImage 114 115 logger.Info("Ensuring that DPU DaemonSet is running", "image", dpuDaemonImage) 116 objs, err := render.RenderDir("./bindata/daemon", &data) 117 if err != nil { 118 logger.Error(err, "Failed to render dpu daemon manifests") 119 return err 120 } 121 122 for _, obj := range objs { 123 if err := ctrl.SetControllerReference(cfg, obj, r.Scheme); err != nil { 124 return err 125 } 126 } 127 128 for _, obj := range objs { 129 logger.Info("Preparing CR", "kind", obj.GetKind()) 130 if obj.GetKind() == "DaemonSet" { 131 scheme := r.Scheme 132 ds := &appsv1.DaemonSet{} 133 err = scheme.Convert(obj, ds, nil) 134 if err != nil { 135 logger.Error(err, "Fail to convert to DaemonSet") 136 return err 137 } 138 ds.Spec.Template.Spec.NodeSelector["dpu"] = "true" 139 err = scheme.Convert(ds, obj, nil) 140 if err != nil { 141 logger.Error(err, "Fail to convert to Unstructured") 142 return err 143 } 144 } 145 if err := apply.ApplyObject(context.TODO(), r.Client, obj); err != nil { 146 return fmt.Errorf("failed to apply object %v with err: %v", obj, err) 147 } 148 } 149 return nil 150 } 151 152 func (r *DpuOperatorConfigReconciler) ensureSriovDevicePluginRunning(ctx context.Context, cfg *configv1.DpuOperatorConfig) error { 153 logger := log.FromContext(ctx) 154 // There will be a device plugin running in the daemon 155 if cfg.Spec.Mode == "host" { 156 data := render.MakeRenderData() 157 // All the CRs will be in the same namespace as the operator config 158 setCommonData(&data, cfg) 159 160 logger.Info("Ensuring that SRIOV Device Plugin DaemonSet is running") 161 objs, err := render.RenderDir("./bindata/sriov-device-plugin", &data) 162 if err != nil { 163 logger.Error(err, "Failed to render SRIOV Device Plugin DaemonSet manifests") 164 return err 165 } 166 167 for _, obj := range objs { 168 if err := ctrl.SetControllerReference(cfg, obj, r.Scheme); err != nil { 169 return err 170 } 171 if err := apply.ApplyObject(context.TODO(), r.Client, obj); err != nil { 172 return fmt.Errorf("failed to apply object %v with err: %v", obj, err) 173 } 174 } 175 } 176 return nil 177 } 178 179 func (r *DpuOperatorConfigReconciler) createNetworkFunctionNad(ctx context.Context, cfg *configv1.DpuOperatorConfig) error { 180 logger := log.FromContext(ctx) 181 if cfg.Spec.Mode == "dpu" { 182 data := render.MakeRenderData() 183 // All the CRs will be in the same namespace as the operator config 184 setCommonData(&data, cfg) 185 data.Data["ResourceName"] = "openshift.io/dpu" // FIXME: Hardcode for now 186 187 logger.Info("Create the Network Function NAD") 188 objs, err := render.RenderDir("./bindata/networkfn-nad", &data) 189 if err != nil { 190 logger.Error(err, "Failed to render SRIOV Device Plugin DaemonSet manifests") 191 return err 192 } 193 194 for _, obj := range objs { 195 if err := ctrl.SetControllerReference(cfg, obj, r.Scheme); err != nil { 196 return err 197 } 198 if err := apply.ApplyObject(context.TODO(), r.Client, obj); err != nil { 199 return fmt.Errorf("failed to apply object %v with err: %v", obj, err) 200 } 201 } 202 } 203 return nil 204 } 205 206 // SetupWithManager sets up the controller with the Manager. 207 func (r *DpuOperatorConfigReconciler) SetupWithManager(mgr ctrl.Manager) error { 208 return ctrl.NewControllerManagedBy(mgr). 209 For(&configv1.DpuOperatorConfig{}). 210 Complete(r) 211 }