github.com/verrazzano/verrazzano@v1.7.1/tools/psr/backend/workers/opensearch/scale/scale.go (about)

     1  // Copyright (c) 2022, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package scale
     5  
     6  import (
     7  	"fmt"
     8  	"strconv"
     9  	"sync/atomic"
    10  	"time"
    11  
    12  	"github.com/prometheus/client_golang/prometheus"
    13  	er "github.com/verrazzano/verrazzano/pkg/controller/errors"
    14  	"github.com/verrazzano/verrazzano/pkg/log/vzlog"
    15  	vzv1alpha1 "github.com/verrazzano/verrazzano/platform-operator/apis/verrazzano/v1alpha1"
    16  	"github.com/verrazzano/verrazzano/tests/e2e/pkg/update"
    17  	"github.com/verrazzano/verrazzano/tools/psr/backend/config"
    18  	"github.com/verrazzano/verrazzano/tools/psr/backend/metrics"
    19  	"github.com/verrazzano/verrazzano/tools/psr/backend/osenv"
    20  	"github.com/verrazzano/verrazzano/tools/psr/backend/pkg/k8sclient"
    21  	psropensearch "github.com/verrazzano/verrazzano/tools/psr/backend/pkg/opensearch"
    22  	psrvz "github.com/verrazzano/verrazzano/tools/psr/backend/pkg/verrazzano"
    23  	"github.com/verrazzano/verrazzano/tools/psr/backend/spi"
    24  )
    25  
const (
	// metricsPrefix is the prefix that is automatically pre-pended to all metrics exported by this worker.
	metricsPrefix = "opensearch_scaling"

	// Environment variable names read by this worker, plus the metric label key for the tier:
	//   - openSearchTier: required variable (no default) naming the OpenSearch tier to scale.
	//   - minReplicaCount / maxReplicaCount: optional variables holding the two replica counts
	//     the worker alternates between; they default to "3" and "5" respectively.
	//   - openSearchTierMetricName: label key under which the tier is attached to this worker's metrics.
	openSearchTier           = "OPENSEARCH_TIER"
	minReplicaCount          = "MIN_REPLICA_COUNT"
	maxReplicaCount          = "MAX_REPLICA_COUNT"
	openSearchTierMetricName = "opensearch_tier"
)
    35  
// funcNewPsrClient is the factory NewScaleWorker calls to obtain the PSR Kubernetes client.
// NOTE(review): it is a package variable rather than a direct call, presumably so it can be
// replaced with a fake in unit tests - confirm against the test files.
var funcNewPsrClient = k8sclient.NewPsrClient
    37  
// worker is the OpenSearch scale worker; it is the spi.Worker implementation of this package.
// All methods use value receivers, so the mutable parts (metrics and state) are held through
// pointers and are therefore shared by every copy of the worker.
type worker struct {
	// metricDescList holds the Prometheus descriptors built once in NewScaleWorker.
	metricDescList []prometheus.Desc
	// Embedded metric values and scale bookkeeping (see workerMetrics and state).
	*workerMetrics
	*state
	// psrClient provides the clients used to read pods and to get/update the Verrazzano CR;
	// log is the worker's own logger, used by the helper methods.
	psrClient k8sclient.PsrClient
	log       vzlog.VerrazzanoLogger
}
    45  
// state carries information from one DoWork call to the next.
//
// startScaleTime is the time.Now().UnixNano() value captured just before the most recent
// CR update; zero means no scale operation has been started yet.
// directionOut is true when the most recent scale operation increased the replica count
// and false when it did not.
type state struct {
	startScaleTime int64
	directionOut   bool
}
    50  
// Compile-time assertion that worker (used by value) satisfies the spi.Worker interface.
var _ spi.Worker = worker{}
    52  
// workerMetrics holds the metrics produced by the worker. Metrics must be thread safe.
// The two *CountTotal items are counters incremented each time DoWork starts a scale
// out / scale in; the two *Seconds items are gauges that DoWork overwrites with the elapsed
// time of the previous scale operation in the matching direction.
type workerMetrics struct {
	scaleOutCountTotal metrics.MetricItem
	scaleInCountTotal  metrics.MetricItem
	scaleOutSeconds    metrics.MetricItem
	scaleInSeconds     metrics.MetricItem
}
    60  
    61  func NewScaleWorker() (spi.Worker, error) {
    62  	c, err := funcNewPsrClient()
    63  	if err != nil {
    64  		return nil, err
    65  	}
    66  	w := worker{
    67  		psrClient: c,
    68  		log:       vzlog.DefaultLogger(),
    69  		state:     &state{},
    70  		workerMetrics: &workerMetrics{
    71  			scaleOutCountTotal: metrics.MetricItem{
    72  				Name: "scale_out_count_total",
    73  				Help: "The total number of times OpenSearch scaled out",
    74  				Type: prometheus.CounterValue,
    75  			},
    76  			scaleInCountTotal: metrics.MetricItem{
    77  				Name: "scale_in_count_total",
    78  				Help: "The total number of times OpenSearch scaled in",
    79  				Type: prometheus.CounterValue,
    80  			},
    81  			scaleOutSeconds: metrics.MetricItem{
    82  				Name: "scale_out_seconds",
    83  				Help: "The number of seconds elapsed to scale out OpenSearch",
    84  				Type: prometheus.GaugeValue,
    85  			},
    86  			scaleInSeconds: metrics.MetricItem{
    87  				Name: "scale_in_seconds",
    88  				Help: "The number of seconds elapsed to scale in OpenSearch",
    89  				Type: prometheus.GaugeValue,
    90  			},
    91  		},
    92  	}
    93  
    94  	if err = config.PsrEnv.LoadFromEnv(w.GetEnvDescList()); err != nil {
    95  		return w, err
    96  	}
    97  
    98  	tier, err := psropensearch.ValidateOpenSeachTier(openSearchTier)
    99  	if err != nil {
   100  		return w, err
   101  	}
   102  
   103  	metricsLabels := map[string]string{
   104  		openSearchTierMetricName:        tier,
   105  		config.PsrWorkerTypeMetricsName: config.PsrEnv.GetEnv(config.PsrWorkerType),
   106  	}
   107  
   108  	w.metricDescList = metrics.BuildMetricDescList([]*metrics.MetricItem{
   109  		&w.scaleOutCountTotal,
   110  		&w.scaleInCountTotal,
   111  		&w.scaleOutSeconds,
   112  		&w.scaleInSeconds,
   113  	}, metricsLabels, w.GetWorkerDesc().MetricsPrefix)
   114  
   115  	return w, nil
   116  }
   117  
   118  // GetWorkerDesc returns the WorkerDesc for the worker
   119  func (w worker) GetWorkerDesc() spi.WorkerDesc {
   120  	return spi.WorkerDesc{
   121  		WorkerType:    config.WorkerTypeOpsScale,
   122  		Description:   "The OpenSearch scale worker scales an OpenSearch tier in and out continuously",
   123  		MetricsPrefix: metricsPrefix,
   124  	}
   125  }
   126  
   127  func (w worker) GetEnvDescList() []osenv.EnvVarDesc {
   128  	return []osenv.EnvVarDesc{
   129  		{Key: openSearchTier, DefaultVal: "", Required: true},
   130  		{Key: minReplicaCount, DefaultVal: "3", Required: false},
   131  		{Key: maxReplicaCount, DefaultVal: "5", Required: false},
   132  	}
   133  }
   134  
   135  func (w worker) GetMetricDescList() []prometheus.Desc {
   136  	return w.metricDescList
   137  }
   138  
   139  func (w worker) GetMetricList() []prometheus.Metric {
   140  	return []prometheus.Metric{
   141  		w.scaleInCountTotal.BuildMetric(),
   142  		w.scaleOutCountTotal.BuildMetric(),
   143  		w.scaleOutSeconds.BuildMetric(),
   144  		w.scaleInSeconds.BuildMetric(),
   145  	}
   146  }
   147  
   148  func (w worker) WantLoopInfoLogged() bool {
   149  	return false
   150  }
   151  
   152  func (w worker) PreconditionsMet() (bool, error) {
   153  	return true, nil
   154  }
   155  
   156  // DoWork continuously scales a specified OpenSearch out and in by modifying the VZ CR OpenSearch component
   157  func (w worker) DoWork(_ config.CommonConfig, log vzlog.VerrazzanoLogger) error {
   158  	// validate OS tier
   159  	tier, err := psropensearch.ValidateOpenSeachTier(openSearchTier)
   160  	if err != nil {
   161  		return err
   162  	}
   163  
   164  	// Wait until VZ is ready
   165  	cr, err := w.waitReady(true)
   166  	if err != nil {
   167  		return log.ErrorfNewErr("Failed to wait for Verrazzano to be ready after update.  The test results are not valid %v", err)
   168  	}
   169  	// Update the elapsed time of the scale operation
   170  	if w.state.startScaleTime > 0 {
   171  		elapsedSecs := time.Now().UnixNano() - w.state.startScaleTime
   172  		if w.state.directionOut {
   173  			atomic.StoreInt64(&w.workerMetrics.scaleOutSeconds.Val, elapsedSecs)
   174  		} else {
   175  			atomic.StoreInt64(&w.workerMetrics.scaleInSeconds.Val, elapsedSecs)
   176  		}
   177  	}
   178  	// Get the current number OpenSearch pods that exist for the given tier
   179  	pods, err := psropensearch.GetPodsForTier(w.psrClient.CrtlRuntime, tier)
   180  	if err != nil {
   181  		return log.ErrorfNewErr("Failed to get the pods for tier %s: %v", tier, err)
   182  	}
   183  	existingReplicas := len(pods)
   184  	if err != nil {
   185  		return log.ErrorfNewErr("Failed to get the pods for tier %s: %v", tier, err)
   186  	}
   187  	if existingReplicas == 0 {
   188  		return log.ErrorfNewErr("Failed, no pods exist for tier %s", tier)
   189  	}
   190  	// Create a modifier that is used to update the Verrazzno CR opensearch replica field
   191  	m, desiredReplicas, err := w.getUpdateModifier(tier, existingReplicas)
   192  	if err != nil {
   193  		return err
   194  	}
   195  	log.Infof("Updating Verrazzano CR OpenSearch %s tier, scaling to %v replicas", tier, desiredReplicas)
   196  
   197  	// Update metrics
   198  	if desiredReplicas > len(pods) {
   199  		w.state.directionOut = true
   200  		atomic.AddInt64(&w.workerMetrics.scaleOutCountTotal.Val, 1)
   201  	} else {
   202  		w.state.directionOut = false
   203  		atomic.AddInt64(&w.workerMetrics.scaleInCountTotal.Val, 1)
   204  	}
   205  	w.state.startScaleTime = time.Now().UnixNano()
   206  
   207  	// Update the CR to change the replica count
   208  	err = w.updateCr(cr, m)
   209  	if err != nil {
   210  		return err
   211  	}
   212  
   213  	// Wait until VZ is NOT ready, this means it started working on the change
   214  	_, err = w.waitReady(false)
   215  	if err != nil {
   216  		return log.ErrorfNewErr("Failed to wait for Verrazzano to be NOT ready after update.  The test results are not valid %v", err)
   217  	}
   218  
   219  	return nil
   220  }
   221  
   222  func (w worker) getUpdateModifier(tier string, currentReplicas int) (update.CRModifier, int, error) {
   223  	max, err := strconv.ParseInt(config.PsrEnv.GetEnv(maxReplicaCount), 10, 32)
   224  	if err != nil {
   225  		return nil, 0, fmt.Errorf("maxReplicaCount can not be parsed to an integer: %v", err)
   226  	}
   227  	min, err := strconv.ParseInt(config.PsrEnv.GetEnv(minReplicaCount), 10, 32)
   228  	if err != nil {
   229  		return nil, 0, fmt.Errorf("minReplicaCount can not be parsed to an integer: %v", err)
   230  	}
   231  	if min < 3 {
   232  		return nil, 0, fmt.Errorf("minReplicaCount can not be less than 3")
   233  	}
   234  	var desiredReplicas int32
   235  	if currentReplicas != int(min) {
   236  		desiredReplicas = int32(min)
   237  	} else {
   238  		desiredReplicas = int32(max)
   239  	}
   240  
   241  	var m update.CRModifier
   242  
   243  	switch tier {
   244  	case psropensearch.MasterTier:
   245  		m = psropensearch.OpensearchMasterNodeGroupModifier{NodeReplicas: desiredReplicas}
   246  	case psropensearch.DataTier:
   247  		m = psropensearch.OpensearchDataNodeGroupModifier{NodeReplicas: desiredReplicas}
   248  	case psropensearch.IngestTier:
   249  		m = psropensearch.OpensearchIngestNodeGroupModifier{NodeReplicas: desiredReplicas}
   250  	}
   251  	return m, int(desiredReplicas), nil
   252  }
   253  
   254  // updateCr updates the Verrazzano CR and retries if there is a conflict error
   255  func (w worker) updateCr(cr *vzv1alpha1.Verrazzano, m update.CRModifier) error {
   256  	for {
   257  		// Modify the CR
   258  		m.ModifyCR(cr)
   259  
   260  		err := psrvz.UpdateVerrazzano(w.psrClient.VzInstall, cr)
   261  		if err == nil {
   262  			break
   263  		}
   264  		if !er.IsUpdateConflict(err) {
   265  			return fmt.Errorf("Failed to scale update Verrazzano cr: %v", err)
   266  		}
   267  		// Conflict error, get latest vz cr
   268  		time.Sleep(1 * time.Second)
   269  		w.log.Info("OpenSearch scaling, Verrazzano CR conflict error, retrying")
   270  
   271  		cr, err = psrvz.GetVerrazzano(w.psrClient.VzInstall)
   272  		if err != nil {
   273  			return err
   274  		}
   275  	}
   276  	w.log.Info("Updated Verrazzano CR")
   277  	return nil
   278  }
   279  
   280  // Wait until Verrazzano is ready or not ready
   281  func (w worker) waitReady(desiredReady bool) (cr *vzv1alpha1.Verrazzano, err error) {
   282  	for {
   283  		cr, err = psrvz.GetVerrazzano(w.psrClient.VzInstall)
   284  		if err != nil {
   285  			return nil, err
   286  		}
   287  		ready := psrvz.IsReady(cr)
   288  		if ready == desiredReady {
   289  			break
   290  		}
   291  		w.log.Progressf("Waiting for Verrazzano CR ready state to be %v", desiredReady)
   292  		time.Sleep(1 * time.Second)
   293  	}
   294  	return cr, err
   295  }