github.com/verrazzano/verrazzano-monitoring-operator@v0.0.30/pkg/vmo/deployment.go (about) 1 // Copyright (C) 2020, 2022, Oracle and/or its affiliates. 2 // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl. 3 4 package vmo 5 6 import ( 7 "context" 8 "errors" 9 "fmt" 10 11 "github.com/verrazzano/pkg/diff" 12 vmcontrollerv1 "github.com/verrazzano/verrazzano-monitoring-operator/pkg/apis/vmcontroller/v1" 13 "github.com/verrazzano/verrazzano-monitoring-operator/pkg/config" 14 "github.com/verrazzano/verrazzano-monitoring-operator/pkg/constants" 15 "github.com/verrazzano/verrazzano-monitoring-operator/pkg/metricsexporter" 16 "github.com/verrazzano/verrazzano-monitoring-operator/pkg/resources" 17 "github.com/verrazzano/verrazzano-monitoring-operator/pkg/resources/deployments" 18 appsv1 "k8s.io/api/apps/v1" 19 k8serrors "k8s.io/apimachinery/pkg/api/errors" 20 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 21 "k8s.io/apimachinery/pkg/labels" 22 "k8s.io/apimachinery/pkg/util/runtime" 23 ) 24 25 func updateOpenSearchDashboardsDeployment(osd *appsv1.Deployment, controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance) error { 26 if osd == nil { 27 return nil 28 } 29 var err error 30 31 // Wait for OS to be green before deploying OS Dashboards 32 if err = controller.osClient.IsGreen(vmo); err != nil { 33 return err 34 } 35 36 existingDeployment, err := controller.deploymentLister.Deployments(vmo.Namespace).Get(osd.Name) 37 if err != nil { 38 if k8serrors.IsNotFound(err) { 39 controller.log.Oncef("Creating deployment %s/%s", osd.Namespace, osd.Name) 40 // Initialize the replica count to one, and scale up one at a time during update. 41 // The OS Dashboard pods are being rolled out one at a time to avoid getting failures 42 // due to indices needing to be migrated. We considered using StatefulSets with a 43 // pod management policy of "ordered ready". However, StatefulSets do not support a 44 // deployment strategy of "recreate", which is also needed to avoid the migrating indices error. 45 osd.Spec.Replicas = resources.NewVal(int32(1)) 46 _, err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Create(context.TODO(), osd, metav1.CreateOptions{}) 47 } else { 48 return err 49 } 50 } else { 51 if err = controller.osClient.IsUpdated(vmo); err != nil { 52 return err 53 } 54 if existingDeployment.Status.AvailableReplicas == *existingDeployment.Spec.Replicas && 55 *resources.NewVal(vmo.Spec.Kibana.Replicas) > *existingDeployment.Spec.Replicas { 56 // Ok to scale up 57 *osd.Spec.Replicas = *existingDeployment.Spec.Replicas + 1 58 controller.log.Oncef("Incrementing replica count of deployment %s/%s to %d", osd.Namespace, osd.Name, *osd.Spec.Replicas) 59 } 60 if err = updateDeployment(controller, vmo, existingDeployment, osd); err == nil { 61 // Return a temporary error if not finished scaling up to the desired replica count 62 if *resources.NewVal(vmo.Spec.Kibana.Replicas) != *existingDeployment.Spec.Replicas { 63 return fmt.Errorf("waiting to bring OS Dashboards replica up to full count") 64 } 65 } 66 } 67 if err != nil { 68 if metric, metricErr := metricsexporter.GetErrorMetrics(metricsexporter.NamesDeploymentUpdateError); metricErr != nil { 69 controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentUpdateError, metricErr) 70 } else { 71 metric.Inc() 72 } 73 controller.log.Errorf("Failed to update deployment %s/%s: %v", osd.Namespace, osd.Name, err) 74 return err 75 } 76 77 return nil 78 } 79 80 // CreateDeployments create/update VMO deployment k8s resources 81 func CreateDeployments(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, pvcToAdMap map[string]string, existingCluster bool) (dirty bool, err error) { 82 // The error count is incremented by the function which calls createDeployment 83 functionMetric, functionError := metricsexporter.GetFunctionMetrics(metricsexporter.NamesDeployment) 84 if functionError == nil { 85 functionMetric.LogStart() 86 defer functionMetric.LogEnd(false) 87 } else { 88 return false, functionError 89 } 90 91 // Assigning the following spec members seems like a hack; is any 92 // better way to make these values available where the deployments are created? 93 vmo.Spec.NatGatewayIPs = controller.operatorConfig.NatGatewayIPs 94 95 expected, err := deployments.New(vmo, controller.kubeclientset, controller.operatorConfig, pvcToAdMap) 96 if err != nil { 97 controller.log.Errorf("Failed to create Deployment specs for VMI %s: %v", vmo.Name, err) 98 return false, err 99 } 100 deployList := expected.Deployments 101 102 var openSearchDeployments []*appsv1.Deployment 103 var deploymentNames []string 104 controller.log.Oncef("Creating/updating ExpectedDeployments for VMI %s", vmo.Name) 105 for _, curDeployment := range deployList { 106 deploymentName := curDeployment.Name 107 deploymentNames = append(deploymentNames, deploymentName) 108 if deploymentName == "" && curDeployment.GenerateName == "" { 109 // We choose to absorb the error here as the worker would requeue the 110 // resource otherwise. Instead, the next time the resource is updated 111 // the resource will be queued again. 112 runtime.HandleError(errors.New("deployment name must be specified")) 113 return true, nil 114 } 115 controller.log.Debugf("Applying Deployment '%s' in namespace '%s' for VMI '%s'\n", deploymentName, vmo.Namespace, vmo.Name) 116 existingDeployment, err := controller.deploymentLister.Deployments(vmo.Namespace).Get(deploymentName) 117 118 if err != nil { 119 if k8serrors.IsNotFound(err) { 120 _, err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Create(context.TODO(), curDeployment, metav1.CreateOptions{}) 121 } else { 122 return false, err 123 } 124 } else if existingDeployment != nil { 125 if existingDeployment.Spec.Template.Labels[constants.ServiceAppLabel] == fmt.Sprintf("%s-%s", vmo.Name, config.ElasticsearchData.Name) { 126 openSearchDeployments = append(openSearchDeployments, curDeployment) 127 } else { 128 err = updateDeployment(controller, vmo, existingDeployment, curDeployment) 129 } 130 } 131 if err != nil { 132 if metric, metricErr := metricsexporter.GetErrorMetrics(metricsexporter.NamesDeploymentUpdateError); metricErr != nil { 133 controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentUpdateError, metricErr) 134 } else { 135 metric.Inc() 136 } 137 controller.log.Errorf("Failed to update deployment %s/%s: %v", curDeployment.Namespace, curDeployment.Name, err) 138 return false, err 139 } 140 } 141 142 openSearchDirty, err := updateOpenSearchDeployments(controller, vmo, openSearchDeployments, existingCluster) 143 if err != nil { 144 return false, err 145 } 146 147 // Create the OSD deployment 148 osd := deployments.NewOpenSearchDashboardsDeployment(vmo) 149 if osd != nil { 150 deploymentNames = append(deploymentNames, osd.Name) 151 err = updateOpenSearchDashboardsDeployment(osd, controller, vmo) 152 if err != nil { 153 return false, err 154 } 155 } 156 157 // Delete deployments that shouldn't exist 158 controller.log.Oncef("Deleting deployments that should not exist for VMI %s", vmo.Name) 159 selector := labels.SelectorFromSet(map[string]string{constants.VMOLabel: vmo.Name}) 160 existingDeploymentsList, err := controller.deploymentLister.Deployments(vmo.Namespace).List(selector) 161 if err != nil { 162 return false, err 163 } 164 for _, deployment := range existingDeploymentsList { 165 if !contains(deploymentNames, deployment.Name) { 166 // if processing an OpenSearch data node, and the data node is expected and running 167 // An OpenSearch health check should be made to prevent unexpected shard allocation 168 if deployments.IsOpenSearchDataDeployment(vmo.Name, deployment) && (expected.OpenSearchDataDeployments > 0 || deployment.Status.ReadyReplicas > 0) { 169 if err := controller.osClient.IsGreen(vmo); err != nil { 170 controller.log.Oncef("Scale down of deployment %s not allowed: cluster health is not green", deployment.Name) 171 continue 172 } 173 } 174 if err := deleteDeployment(controller, vmo, deployment); err != nil { 175 return false, err 176 } 177 } 178 } 179 180 return openSearchDirty, nil 181 } 182 183 func deleteDeployment(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, deployment *appsv1.Deployment) error { 184 controller.log.Oncef("Deleting deployment %s/%s", deployment.Namespace, deployment.Name) 185 metric, err := metricsexporter.GetCounterMetrics(metricsexporter.NamesDeploymentDeleteCounter) 186 if err != nil { 187 // log it but continue on with deleting the deployment 188 controller.log.Errorf("Failed to get counter metric %s: %v", metricsexporter.NamesDeploymentDeleteCounter, err) 189 } else { 190 metric.Inc() 191 } 192 err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Delete(context.TODO(), deployment.Name, metav1.DeleteOptions{}) 193 if err != nil { 194 controller.log.Errorf("Failed to delete deployment %s: %v", deployment.Name, err) 195 if metric, metricErr := metricsexporter.GetErrorMetrics(metricsexporter.NamesDeploymentDeleteError); metricErr != nil { 196 controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentDeleteError, metricErr) 197 } else { 198 metric.Inc() 199 } 200 return err 201 } 202 return nil 203 } 204 205 func updateDeployment(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, existingDeployment, curDeployment *appsv1.Deployment) error { 206 if metric, metricErr := metricsexporter.GetCounterMetrics(metricsexporter.NamesDeploymentUpdateCounter); metricErr != nil { 207 controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentUpdateCounter, metricErr) 208 } else { 209 metric.Inc() 210 } 211 var err error 212 curDeployment.Spec.Selector = existingDeployment.Spec.Selector 213 specDiffs := diff.Diff(existingDeployment, curDeployment) 214 if specDiffs != "" { 215 controller.log.Oncef("Deployment %s/%s has spec differences %s", curDeployment.Namespace, curDeployment.Name, specDiffs) 216 controller.log.Oncef("Updating deployment %s/%s", curDeployment.Namespace, curDeployment.Name) 217 _, err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Update(context.TODO(), curDeployment, metav1.UpdateOptions{}) 218 } 219 220 return err 221 } 222 223 // Updates the *next* candidate deployment of the given deployments list. A deployment is a candidate only if 224 // its predecessors in the list have already been updated and are fully up and running. 225 // return false if 1) no errors occurred, and 2) no work was done 226 func rollingUpdate(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, deployments []*appsv1.Deployment) (dirty bool, err error) { 227 for index, current := range deployments { 228 existing, err := controller.deploymentLister.Deployments(vmo.Namespace).Get(current.Name) 229 if err != nil { 230 return false, err 231 } 232 233 // check if the current node is ready to be updated. If it can't, skip it for the next reconcile 234 if !isUpdateAllowed(controller, vmo, current) { 235 continue 236 } 237 metric, metricErr := metricsexporter.GetCounterMetrics(metricsexporter.NamesDeploymentUpdateCounter) 238 if metricErr != nil { 239 return false, metricErr 240 } 241 metric.Inc() 242 // Selector may not change, so we copy over from existing 243 current.Spec.Selector = existing.Spec.Selector 244 // Deployment spec differences, so call Update() and return 245 specDiffs := diff.Diff(existing, current) 246 if specDiffs != "" { 247 controller.log.Debugf("Deployment %s : Spec differences %s", current.Name, specDiffs) 248 controller.log.Oncef("Updating deployment %s in namespace %s", current.Name, current.Namespace) 249 _, err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Update(context.TODO(), current, metav1.UpdateOptions{}) 250 if err != nil { 251 if metric, metricErr := metricsexporter.GetErrorMetrics(metricsexporter.NamesDeploymentUpdateError); err != nil { 252 controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentUpdateError, metricErr) 253 } else { 254 metric.Inc() 255 } 256 return false, err 257 } 258 //okay to return dirty=false after updating the *last* deployment 259 return index < len(deployments)-1, nil 260 } 261 // If the (already updated) deployment is not fully up and running, then return 262 if existing.Status.Replicas != 1 || existing.Status.Replicas != existing.Status.AvailableReplicas { 263 return true, nil 264 } 265 } 266 return false, nil 267 } 268 269 func updateOpenSearchDeployments(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, deployments []*appsv1.Deployment, existingCluster bool) (dirty bool, err error) { 270 // if the cluster isn't up, patch all deployments sequentially 271 if !existingCluster { 272 return updateAllDeployments(controller, vmo, deployments) 273 } 274 // if the cluster is running, do a rolling update of each deployment 275 return rollingUpdate(controller, vmo, deployments) 276 } 277 278 // Update all deployments in the list concurrently 279 func updateAllDeployments(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, deployments []*appsv1.Deployment) (dirty bool, err error) { 280 for _, curDeployment := range deployments { 281 _, err := controller.deploymentLister.Deployments(vmo.Namespace).Get(curDeployment.Name) 282 if err != nil { 283 return false, err 284 } 285 metric, metricErr := metricsexporter.GetCounterMetrics(metricsexporter.NamesDeploymentUpdateCounter) 286 if metricErr != nil { 287 return false, metricErr 288 } 289 metric.Inc() 290 controller.log.Oncef("Updating deployment %s in namespace %s", curDeployment.Name, curDeployment.Namespace) 291 _, err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Update(context.TODO(), curDeployment, metav1.UpdateOptions{}) 292 if err != nil { 293 if metric, metricErr := metricsexporter.GetErrorMetrics(metricsexporter.NamesDeploymentUpdateError); metricErr != nil { 294 controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentUpdateError, metricErr) 295 } else { 296 metric.Inc() 297 } 298 return false, err 299 } 300 } 301 return false, nil 302 } 303 304 // isUpdateAllowed checks if OpenSearch nodes are allowed to update. If a data node is removed when the cluster is yellow, 305 // data loss may occur. 306 func isUpdateAllowed(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, current *appsv1.Deployment) bool { 307 // if current is an OpenSearch data node 308 if deployments.IsOpenSearchDataDeployment(vmo.Namespace, current) { 309 // if the node is down, we should try to fix it 310 if current.Status.ReadyReplicas == 0 { 311 return true 312 } 313 314 // if the node is running, we shouldn't take it down unless the cluster is green (to avoid data loss) 315 if err := controller.osClient.IsGreen(vmo); err != nil { 316 controller.log.Oncef("OpenSearch node %s was not upgraded, since the cluster is not ready", current.Name) 317 return false 318 } 319 } 320 return true 321 }