github.com/verrazzano/verrazzano@v1.7.1/tools/psr/backend/workers/opensearch/scale/scale.go (about) 1 // Copyright (c) 2022, Oracle and/or its affiliates. 2 // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl. 3 4 package scale 5 6 import ( 7 "fmt" 8 "strconv" 9 "sync/atomic" 10 "time" 11 12 "github.com/prometheus/client_golang/prometheus" 13 er "github.com/verrazzano/verrazzano/pkg/controller/errors" 14 "github.com/verrazzano/verrazzano/pkg/log/vzlog" 15 vzv1alpha1 "github.com/verrazzano/verrazzano/platform-operator/apis/verrazzano/v1alpha1" 16 "github.com/verrazzano/verrazzano/tests/e2e/pkg/update" 17 "github.com/verrazzano/verrazzano/tools/psr/backend/config" 18 "github.com/verrazzano/verrazzano/tools/psr/backend/metrics" 19 "github.com/verrazzano/verrazzano/tools/psr/backend/osenv" 20 "github.com/verrazzano/verrazzano/tools/psr/backend/pkg/k8sclient" 21 psropensearch "github.com/verrazzano/verrazzano/tools/psr/backend/pkg/opensearch" 22 psrvz "github.com/verrazzano/verrazzano/tools/psr/backend/pkg/verrazzano" 23 "github.com/verrazzano/verrazzano/tools/psr/backend/spi" 24 ) 25 26 const ( 27 // metricsPrefix is the prefix that is automatically pre-pended to all metrics exported by this worker. 28 metricsPrefix = "opensearch_scaling" 29 30 openSearchTier = "OPENSEARCH_TIER" 31 minReplicaCount = "MIN_REPLICA_COUNT" 32 maxReplicaCount = "MAX_REPLICA_COUNT" 33 openSearchTierMetricName = "opensearch_tier" 34 ) 35 36 var funcNewPsrClient = k8sclient.NewPsrClient 37 38 type worker struct { 39 metricDescList []prometheus.Desc 40 *workerMetrics 41 *state 42 psrClient k8sclient.PsrClient 43 log vzlog.VerrazzanoLogger 44 } 45 46 type state struct { 47 startScaleTime int64 48 directionOut bool 49 } 50 51 var _ spi.Worker = worker{} 52 53 // scaleMetrics holds the metrics produced by the worker. Metrics must be thread safe. 54 type workerMetrics struct { 55 scaleOutCountTotal metrics.MetricItem 56 scaleInCountTotal metrics.MetricItem 57 scaleOutSeconds metrics.MetricItem 58 scaleInSeconds metrics.MetricItem 59 } 60 61 func NewScaleWorker() (spi.Worker, error) { 62 c, err := funcNewPsrClient() 63 if err != nil { 64 return nil, err 65 } 66 w := worker{ 67 psrClient: c, 68 log: vzlog.DefaultLogger(), 69 state: &state{}, 70 workerMetrics: &workerMetrics{ 71 scaleOutCountTotal: metrics.MetricItem{ 72 Name: "scale_out_count_total", 73 Help: "The total number of times OpenSearch scaled out", 74 Type: prometheus.CounterValue, 75 }, 76 scaleInCountTotal: metrics.MetricItem{ 77 Name: "scale_in_count_total", 78 Help: "The total number of times OpenSearch scaled in", 79 Type: prometheus.CounterValue, 80 }, 81 scaleOutSeconds: metrics.MetricItem{ 82 Name: "scale_out_seconds", 83 Help: "The number of seconds elapsed to scale out OpenSearch", 84 Type: prometheus.GaugeValue, 85 }, 86 scaleInSeconds: metrics.MetricItem{ 87 Name: "scale_in_seconds", 88 Help: "The number of seconds elapsed to scale in OpenSearch", 89 Type: prometheus.GaugeValue, 90 }, 91 }, 92 } 93 94 if err = config.PsrEnv.LoadFromEnv(w.GetEnvDescList()); err != nil { 95 return w, err 96 } 97 98 tier, err := psropensearch.ValidateOpenSeachTier(openSearchTier) 99 if err != nil { 100 return w, err 101 } 102 103 metricsLabels := map[string]string{ 104 openSearchTierMetricName: tier, 105 config.PsrWorkerTypeMetricsName: config.PsrEnv.GetEnv(config.PsrWorkerType), 106 } 107 108 w.metricDescList = metrics.BuildMetricDescList([]*metrics.MetricItem{ 109 &w.scaleOutCountTotal, 110 &w.scaleInCountTotal, 111 &w.scaleOutSeconds, 112 &w.scaleInSeconds, 113 }, metricsLabels, w.GetWorkerDesc().MetricsPrefix) 114 115 return w, nil 116 } 117 118 // GetWorkerDesc returns the WorkerDesc for the worker 119 func (w worker) GetWorkerDesc() spi.WorkerDesc { 120 return spi.WorkerDesc{ 121 WorkerType: config.WorkerTypeOpsScale, 122 Description: "The OpenSearch scale worker scales an OpenSearch tier in and out continuously", 123 MetricsPrefix: metricsPrefix, 124 } 125 } 126 127 func (w worker) GetEnvDescList() []osenv.EnvVarDesc { 128 return []osenv.EnvVarDesc{ 129 {Key: openSearchTier, DefaultVal: "", Required: true}, 130 {Key: minReplicaCount, DefaultVal: "3", Required: false}, 131 {Key: maxReplicaCount, DefaultVal: "5", Required: false}, 132 } 133 } 134 135 func (w worker) GetMetricDescList() []prometheus.Desc { 136 return w.metricDescList 137 } 138 139 func (w worker) GetMetricList() []prometheus.Metric { 140 return []prometheus.Metric{ 141 w.scaleInCountTotal.BuildMetric(), 142 w.scaleOutCountTotal.BuildMetric(), 143 w.scaleOutSeconds.BuildMetric(), 144 w.scaleInSeconds.BuildMetric(), 145 } 146 } 147 148 func (w worker) WantLoopInfoLogged() bool { 149 return false 150 } 151 152 func (w worker) PreconditionsMet() (bool, error) { 153 return true, nil 154 } 155 156 // DoWork continuously scales a specified OpenSearch out and in by modifying the VZ CR OpenSearch component 157 func (w worker) DoWork(_ config.CommonConfig, log vzlog.VerrazzanoLogger) error { 158 // validate OS tier 159 tier, err := psropensearch.ValidateOpenSeachTier(openSearchTier) 160 if err != nil { 161 return err 162 } 163 164 // Wait until VZ is ready 165 cr, err := w.waitReady(true) 166 if err != nil { 167 return log.ErrorfNewErr("Failed to wait for Verrazzano to be ready after update. The test results are not valid %v", err) 168 } 169 // Update the elapsed time of the scale operation 170 if w.state.startScaleTime > 0 { 171 elapsedSecs := time.Now().UnixNano() - w.state.startScaleTime 172 if w.state.directionOut { 173 atomic.StoreInt64(&w.workerMetrics.scaleOutSeconds.Val, elapsedSecs) 174 } else { 175 atomic.StoreInt64(&w.workerMetrics.scaleInSeconds.Val, elapsedSecs) 176 } 177 } 178 // Get the current number OpenSearch pods that exist for the given tier 179 pods, err := psropensearch.GetPodsForTier(w.psrClient.CrtlRuntime, tier) 180 if err != nil { 181 return log.ErrorfNewErr("Failed to get the pods for tier %s: %v", tier, err) 182 } 183 existingReplicas := len(pods) 184 if err != nil { 185 return log.ErrorfNewErr("Failed to get the pods for tier %s: %v", tier, err) 186 } 187 if existingReplicas == 0 { 188 return log.ErrorfNewErr("Failed, no pods exist for tier %s", tier) 189 } 190 // Create a modifier that is used to update the Verrazzno CR opensearch replica field 191 m, desiredReplicas, err := w.getUpdateModifier(tier, existingReplicas) 192 if err != nil { 193 return err 194 } 195 log.Infof("Updating Verrazzano CR OpenSearch %s tier, scaling to %v replicas", tier, desiredReplicas) 196 197 // Update metrics 198 if desiredReplicas > len(pods) { 199 w.state.directionOut = true 200 atomic.AddInt64(&w.workerMetrics.scaleOutCountTotal.Val, 1) 201 } else { 202 w.state.directionOut = false 203 atomic.AddInt64(&w.workerMetrics.scaleInCountTotal.Val, 1) 204 } 205 w.state.startScaleTime = time.Now().UnixNano() 206 207 // Update the CR to change the replica count 208 err = w.updateCr(cr, m) 209 if err != nil { 210 return err 211 } 212 213 // Wait until VZ is NOT ready, this means it started working on the change 214 _, err = w.waitReady(false) 215 if err != nil { 216 return log.ErrorfNewErr("Failed to wait for Verrazzano to be NOT ready after update. The test results are not valid %v", err) 217 } 218 219 return nil 220 } 221 222 func (w worker) getUpdateModifier(tier string, currentReplicas int) (update.CRModifier, int, error) { 223 max, err := strconv.ParseInt(config.PsrEnv.GetEnv(maxReplicaCount), 10, 32) 224 if err != nil { 225 return nil, 0, fmt.Errorf("maxReplicaCount can not be parsed to an integer: %v", err) 226 } 227 min, err := strconv.ParseInt(config.PsrEnv.GetEnv(minReplicaCount), 10, 32) 228 if err != nil { 229 return nil, 0, fmt.Errorf("minReplicaCount can not be parsed to an integer: %v", err) 230 } 231 if min < 3 { 232 return nil, 0, fmt.Errorf("minReplicaCount can not be less than 3") 233 } 234 var desiredReplicas int32 235 if currentReplicas != int(min) { 236 desiredReplicas = int32(min) 237 } else { 238 desiredReplicas = int32(max) 239 } 240 241 var m update.CRModifier 242 243 switch tier { 244 case psropensearch.MasterTier: 245 m = psropensearch.OpensearchMasterNodeGroupModifier{NodeReplicas: desiredReplicas} 246 case psropensearch.DataTier: 247 m = psropensearch.OpensearchDataNodeGroupModifier{NodeReplicas: desiredReplicas} 248 case psropensearch.IngestTier: 249 m = psropensearch.OpensearchIngestNodeGroupModifier{NodeReplicas: desiredReplicas} 250 } 251 return m, int(desiredReplicas), nil 252 } 253 254 // updateCr updates the Verrazzano CR and retries if there is a conflict error 255 func (w worker) updateCr(cr *vzv1alpha1.Verrazzano, m update.CRModifier) error { 256 for { 257 // Modify the CR 258 m.ModifyCR(cr) 259 260 err := psrvz.UpdateVerrazzano(w.psrClient.VzInstall, cr) 261 if err == nil { 262 break 263 } 264 if !er.IsUpdateConflict(err) { 265 return fmt.Errorf("Failed to scale update Verrazzano cr: %v", err) 266 } 267 // Conflict error, get latest vz cr 268 time.Sleep(1 * time.Second) 269 w.log.Info("OpenSearch scaling, Verrazzano CR conflict error, retrying") 270 271 cr, err = psrvz.GetVerrazzano(w.psrClient.VzInstall) 272 if err != nil { 273 return err 274 } 275 } 276 w.log.Info("Updated Verrazzano CR") 277 return nil 278 } 279 280 // Wait until Verrazzano is ready or not ready 281 func (w worker) waitReady(desiredReady bool) (cr *vzv1alpha1.Verrazzano, err error) { 282 for { 283 cr, err = psrvz.GetVerrazzano(w.psrClient.VzInstall) 284 if err != nil { 285 return nil, err 286 } 287 ready := psrvz.IsReady(cr) 288 if ready == desiredReady { 289 break 290 } 291 w.log.Progressf("Waiting for Verrazzano CR ready state to be %v", desiredReady) 292 time.Sleep(1 * time.Second) 293 } 294 return cr, err 295 }