github.com/verrazzano/verrazzano-monitoring-operator@v0.0.30/pkg/vmo/deployment.go (about)

     1  // Copyright (C) 2020, 2022, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package vmo
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  
    11  	"github.com/verrazzano/pkg/diff"
    12  	vmcontrollerv1 "github.com/verrazzano/verrazzano-monitoring-operator/pkg/apis/vmcontroller/v1"
    13  	"github.com/verrazzano/verrazzano-monitoring-operator/pkg/config"
    14  	"github.com/verrazzano/verrazzano-monitoring-operator/pkg/constants"
    15  	"github.com/verrazzano/verrazzano-monitoring-operator/pkg/metricsexporter"
    16  	"github.com/verrazzano/verrazzano-monitoring-operator/pkg/resources"
    17  	"github.com/verrazzano/verrazzano-monitoring-operator/pkg/resources/deployments"
    18  	appsv1 "k8s.io/api/apps/v1"
    19  	k8serrors "k8s.io/apimachinery/pkg/api/errors"
    20  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    21  	"k8s.io/apimachinery/pkg/labels"
    22  	"k8s.io/apimachinery/pkg/util/runtime"
    23  )
    24  
    25  func updateOpenSearchDashboardsDeployment(osd *appsv1.Deployment, controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance) error {
    26  	if osd == nil {
    27  		return nil
    28  	}
    29  	var err error
    30  
    31  	// Wait for OS to be green before deploying OS Dashboards
    32  	if err = controller.osClient.IsGreen(vmo); err != nil {
    33  		return err
    34  	}
    35  
    36  	existingDeployment, err := controller.deploymentLister.Deployments(vmo.Namespace).Get(osd.Name)
    37  	if err != nil {
    38  		if k8serrors.IsNotFound(err) {
    39  			controller.log.Oncef("Creating deployment %s/%s", osd.Namespace, osd.Name)
    40  			// Initialize the replica count to one, and scale up one at a time during update.
    41  			// The OS Dashboard pods are being rolled out one at a time to avoid getting failures
    42  			// due to indices needing to be migrated.  We considered using StatefulSets with a
    43  			// pod management policy of "ordered ready".  However, StatefulSets do not support a
    44  			// deployment strategy of "recreate", which is also needed to avoid the migrating indices error.
    45  			osd.Spec.Replicas = resources.NewVal(int32(1))
    46  			_, err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Create(context.TODO(), osd, metav1.CreateOptions{})
    47  		} else {
    48  			return err
    49  		}
    50  	} else {
    51  		if err = controller.osClient.IsUpdated(vmo); err != nil {
    52  			return err
    53  		}
    54  		if existingDeployment.Status.AvailableReplicas == *existingDeployment.Spec.Replicas &&
    55  			*resources.NewVal(vmo.Spec.Kibana.Replicas) > *existingDeployment.Spec.Replicas {
    56  			// Ok to scale up
    57  			*osd.Spec.Replicas = *existingDeployment.Spec.Replicas + 1
    58  			controller.log.Oncef("Incrementing replica count of deployment %s/%s to %d", osd.Namespace, osd.Name, *osd.Spec.Replicas)
    59  		}
    60  		if err = updateDeployment(controller, vmo, existingDeployment, osd); err == nil {
    61  			// Return a temporary error if not finished scaling up to the desired replica count
    62  			if *resources.NewVal(vmo.Spec.Kibana.Replicas) != *existingDeployment.Spec.Replicas {
    63  				return fmt.Errorf("waiting to bring OS Dashboards replica up to full count")
    64  			}
    65  		}
    66  	}
    67  	if err != nil {
    68  		if metric, metricErr := metricsexporter.GetErrorMetrics(metricsexporter.NamesDeploymentUpdateError); metricErr != nil {
    69  			controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentUpdateError, metricErr)
    70  		} else {
    71  			metric.Inc()
    72  		}
    73  		controller.log.Errorf("Failed to update deployment %s/%s: %v", osd.Namespace, osd.Name, err)
    74  		return err
    75  	}
    76  
    77  	return nil
    78  }
    79  
    80  // CreateDeployments create/update VMO deployment k8s resources
    81  func CreateDeployments(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, pvcToAdMap map[string]string, existingCluster bool) (dirty bool, err error) {
    82  	// The error count is incremented by the function which calls createDeployment
    83  	functionMetric, functionError := metricsexporter.GetFunctionMetrics(metricsexporter.NamesDeployment)
    84  	if functionError == nil {
    85  		functionMetric.LogStart()
    86  		defer functionMetric.LogEnd(false)
    87  	} else {
    88  		return false, functionError
    89  	}
    90  
    91  	// Assigning the following spec members seems like a hack; is any
    92  	// better way to make these values available where the deployments are created?
    93  	vmo.Spec.NatGatewayIPs = controller.operatorConfig.NatGatewayIPs
    94  
    95  	expected, err := deployments.New(vmo, controller.kubeclientset, controller.operatorConfig, pvcToAdMap)
    96  	if err != nil {
    97  		controller.log.Errorf("Failed to create Deployment specs for VMI %s: %v", vmo.Name, err)
    98  		return false, err
    99  	}
   100  	deployList := expected.Deployments
   101  
   102  	var openSearchDeployments []*appsv1.Deployment
   103  	var deploymentNames []string
   104  	controller.log.Oncef("Creating/updating ExpectedDeployments for VMI %s", vmo.Name)
   105  	for _, curDeployment := range deployList {
   106  		deploymentName := curDeployment.Name
   107  		deploymentNames = append(deploymentNames, deploymentName)
   108  		if deploymentName == "" && curDeployment.GenerateName == "" {
   109  			// We choose to absorb the error here as the worker would requeue the
   110  			// resource otherwise. Instead, the next time the resource is updated
   111  			// the resource will be queued again.
   112  			runtime.HandleError(errors.New("deployment name must be specified"))
   113  			return true, nil
   114  		}
   115  		controller.log.Debugf("Applying Deployment '%s' in namespace '%s' for VMI '%s'\n", deploymentName, vmo.Namespace, vmo.Name)
   116  		existingDeployment, err := controller.deploymentLister.Deployments(vmo.Namespace).Get(deploymentName)
   117  
   118  		if err != nil {
   119  			if k8serrors.IsNotFound(err) {
   120  				_, err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Create(context.TODO(), curDeployment, metav1.CreateOptions{})
   121  			} else {
   122  				return false, err
   123  			}
   124  		} else if existingDeployment != nil {
   125  			if existingDeployment.Spec.Template.Labels[constants.ServiceAppLabel] == fmt.Sprintf("%s-%s", vmo.Name, config.ElasticsearchData.Name) {
   126  				openSearchDeployments = append(openSearchDeployments, curDeployment)
   127  			} else {
   128  				err = updateDeployment(controller, vmo, existingDeployment, curDeployment)
   129  			}
   130  		}
   131  		if err != nil {
   132  			if metric, metricErr := metricsexporter.GetErrorMetrics(metricsexporter.NamesDeploymentUpdateError); metricErr != nil {
   133  				controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentUpdateError, metricErr)
   134  			} else {
   135  				metric.Inc()
   136  			}
   137  			controller.log.Errorf("Failed to update deployment %s/%s: %v", curDeployment.Namespace, curDeployment.Name, err)
   138  			return false, err
   139  		}
   140  	}
   141  
   142  	openSearchDirty, err := updateOpenSearchDeployments(controller, vmo, openSearchDeployments, existingCluster)
   143  	if err != nil {
   144  		return false, err
   145  	}
   146  
   147  	// Create the OSD deployment
   148  	osd := deployments.NewOpenSearchDashboardsDeployment(vmo)
   149  	if osd != nil {
   150  		deploymentNames = append(deploymentNames, osd.Name)
   151  		err = updateOpenSearchDashboardsDeployment(osd, controller, vmo)
   152  		if err != nil {
   153  			return false, err
   154  		}
   155  	}
   156  
   157  	// Delete deployments that shouldn't exist
   158  	controller.log.Oncef("Deleting deployments that should not exist for VMI %s", vmo.Name)
   159  	selector := labels.SelectorFromSet(map[string]string{constants.VMOLabel: vmo.Name})
   160  	existingDeploymentsList, err := controller.deploymentLister.Deployments(vmo.Namespace).List(selector)
   161  	if err != nil {
   162  		return false, err
   163  	}
   164  	for _, deployment := range existingDeploymentsList {
   165  		if !contains(deploymentNames, deployment.Name) {
   166  			// if processing an OpenSearch data node, and the data node is expected and running
   167  			// An OpenSearch health check should be made to prevent unexpected shard allocation
   168  			if deployments.IsOpenSearchDataDeployment(vmo.Name, deployment) && (expected.OpenSearchDataDeployments > 0 || deployment.Status.ReadyReplicas > 0) {
   169  				if err := controller.osClient.IsGreen(vmo); err != nil {
   170  					controller.log.Oncef("Scale down of deployment %s not allowed: cluster health is not green", deployment.Name)
   171  					continue
   172  				}
   173  			}
   174  			if err := deleteDeployment(controller, vmo, deployment); err != nil {
   175  				return false, err
   176  			}
   177  		}
   178  	}
   179  
   180  	return openSearchDirty, nil
   181  }
   182  
   183  func deleteDeployment(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, deployment *appsv1.Deployment) error {
   184  	controller.log.Oncef("Deleting deployment %s/%s", deployment.Namespace, deployment.Name)
   185  	metric, err := metricsexporter.GetCounterMetrics(metricsexporter.NamesDeploymentDeleteCounter)
   186  	if err != nil {
   187  		// log it but continue on with deleting the deployment
   188  		controller.log.Errorf("Failed to get counter metric %s: %v", metricsexporter.NamesDeploymentDeleteCounter, err)
   189  	} else {
   190  		metric.Inc()
   191  	}
   192  	err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Delete(context.TODO(), deployment.Name, metav1.DeleteOptions{})
   193  	if err != nil {
   194  		controller.log.Errorf("Failed to delete deployment %s: %v", deployment.Name, err)
   195  		if metric, metricErr := metricsexporter.GetErrorMetrics(metricsexporter.NamesDeploymentDeleteError); metricErr != nil {
   196  			controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentDeleteError, metricErr)
   197  		} else {
   198  			metric.Inc()
   199  		}
   200  		return err
   201  	}
   202  	return nil
   203  }
   204  
   205  func updateDeployment(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, existingDeployment, curDeployment *appsv1.Deployment) error {
   206  	if metric, metricErr := metricsexporter.GetCounterMetrics(metricsexporter.NamesDeploymentUpdateCounter); metricErr != nil {
   207  		controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentUpdateCounter, metricErr)
   208  	} else {
   209  		metric.Inc()
   210  	}
   211  	var err error
   212  	curDeployment.Spec.Selector = existingDeployment.Spec.Selector
   213  	specDiffs := diff.Diff(existingDeployment, curDeployment)
   214  	if specDiffs != "" {
   215  		controller.log.Oncef("Deployment %s/%s has spec differences %s", curDeployment.Namespace, curDeployment.Name, specDiffs)
   216  		controller.log.Oncef("Updating deployment %s/%s", curDeployment.Namespace, curDeployment.Name)
   217  		_, err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Update(context.TODO(), curDeployment, metav1.UpdateOptions{})
   218  	}
   219  
   220  	return err
   221  }
   222  
   223  // Updates the *next* candidate deployment of the given deployments list.  A deployment is a candidate only if
   224  // its predecessors in the list have already been updated and are fully up and running.
   225  // return false if 1) no errors occurred, and 2) no work was done
   226  func rollingUpdate(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, deployments []*appsv1.Deployment) (dirty bool, err error) {
   227  	for index, current := range deployments {
   228  		existing, err := controller.deploymentLister.Deployments(vmo.Namespace).Get(current.Name)
   229  		if err != nil {
   230  			return false, err
   231  		}
   232  
   233  		// check if the current node is ready to be updated. If it can't, skip it for the next reconcile
   234  		if !isUpdateAllowed(controller, vmo, current) {
   235  			continue
   236  		}
   237  		metric, metricErr := metricsexporter.GetCounterMetrics(metricsexporter.NamesDeploymentUpdateCounter)
   238  		if metricErr != nil {
   239  			return false, metricErr
   240  		}
   241  		metric.Inc()
   242  		// Selector may not change, so we copy over from existing
   243  		current.Spec.Selector = existing.Spec.Selector
   244  		// Deployment spec differences, so call Update() and return
   245  		specDiffs := diff.Diff(existing, current)
   246  		if specDiffs != "" {
   247  			controller.log.Debugf("Deployment %s : Spec differences %s", current.Name, specDiffs)
   248  			controller.log.Oncef("Updating deployment %s in namespace %s", current.Name, current.Namespace)
   249  			_, err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Update(context.TODO(), current, metav1.UpdateOptions{})
   250  			if err != nil {
   251  				if metric, metricErr := metricsexporter.GetErrorMetrics(metricsexporter.NamesDeploymentUpdateError); err != nil {
   252  					controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentUpdateError, metricErr)
   253  				} else {
   254  					metric.Inc()
   255  				}
   256  				return false, err
   257  			}
   258  			//okay to return dirty=false after updating the *last* deployment
   259  			return index < len(deployments)-1, nil
   260  		}
   261  		// If the (already updated) deployment is not fully up and running, then return
   262  		if existing.Status.Replicas != 1 || existing.Status.Replicas != existing.Status.AvailableReplicas {
   263  			return true, nil
   264  		}
   265  	}
   266  	return false, nil
   267  }
   268  
   269  func updateOpenSearchDeployments(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, deployments []*appsv1.Deployment, existingCluster bool) (dirty bool, err error) {
   270  	// if the cluster isn't up, patch all deployments sequentially
   271  	if !existingCluster {
   272  		return updateAllDeployments(controller, vmo, deployments)
   273  	}
   274  	// if the cluster is running, do a rolling update of each deployment
   275  	return rollingUpdate(controller, vmo, deployments)
   276  }
   277  
   278  // Update all deployments in the list concurrently
   279  func updateAllDeployments(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, deployments []*appsv1.Deployment) (dirty bool, err error) {
   280  	for _, curDeployment := range deployments {
   281  		_, err := controller.deploymentLister.Deployments(vmo.Namespace).Get(curDeployment.Name)
   282  		if err != nil {
   283  			return false, err
   284  		}
   285  		metric, metricErr := metricsexporter.GetCounterMetrics(metricsexporter.NamesDeploymentUpdateCounter)
   286  		if metricErr != nil {
   287  			return false, metricErr
   288  		}
   289  		metric.Inc()
   290  		controller.log.Oncef("Updating deployment %s in namespace %s", curDeployment.Name, curDeployment.Namespace)
   291  		_, err = controller.kubeclientset.AppsV1().Deployments(vmo.Namespace).Update(context.TODO(), curDeployment, metav1.UpdateOptions{})
   292  		if err != nil {
   293  			if metric, metricErr := metricsexporter.GetErrorMetrics(metricsexporter.NamesDeploymentUpdateError); metricErr != nil {
   294  				controller.log.Errorf("Failed to get error metric %s: %v", metricsexporter.NamesDeploymentUpdateError, metricErr)
   295  			} else {
   296  				metric.Inc()
   297  			}
   298  			return false, err
   299  		}
   300  	}
   301  	return false, nil
   302  }
   303  
   304  // isUpdateAllowed checks if OpenSearch nodes are allowed to update. If a data node is removed when the cluster is yellow,
   305  // data loss may occur.
   306  func isUpdateAllowed(controller *Controller, vmo *vmcontrollerv1.VerrazzanoMonitoringInstance, current *appsv1.Deployment) bool {
   307  	// if current is an OpenSearch data node
   308  	if deployments.IsOpenSearchDataDeployment(vmo.Namespace, current) {
   309  		// if the node is down, we should try to fix it
   310  		if current.Status.ReadyReplicas == 0 {
   311  			return true
   312  		}
   313  
   314  		// if the node is running, we shouldn't take it down unless the cluster is green (to avoid data loss)
   315  		if err := controller.osClient.IsGreen(vmo); err != nil {
   316  			controller.log.Oncef("OpenSearch node %s was not upgraded, since the cluster is not ready", current.Name)
   317  			return false
   318  		}
   319  	}
   320  	return true
   321  }