github.com/verrazzano/verrazzano@v1.7.1/tools/psr/backend/workers/opensearch/restart/restart.go (about) 1 // Copyright (c) 2022, Oracle and/or its affiliates. 2 // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl. 3 4 package restart 5 6 import ( 7 "context" 8 "crypto/rand" 9 "fmt" 10 "github.com/verrazzano/verrazzano/pkg/constants" 11 "github.com/verrazzano/verrazzano/pkg/k8s/ready" 12 "github.com/verrazzano/verrazzano/tools/psr/backend/pkg/k8sclient" 13 psropensearch "github.com/verrazzano/verrazzano/tools/psr/backend/pkg/opensearch" 14 "k8s.io/apimachinery/pkg/labels" 15 "k8s.io/apimachinery/pkg/selection" 16 "k8s.io/apimachinery/pkg/types" 17 "math/big" 18 "sigs.k8s.io/controller-runtime/pkg/client" 19 "sync/atomic" 20 "time" 21 22 "github.com/prometheus/client_golang/prometheus" 23 "github.com/verrazzano/verrazzano/pkg/log/vzlog" 24 "github.com/verrazzano/verrazzano/tools/psr/backend/config" 25 "github.com/verrazzano/verrazzano/tools/psr/backend/metrics" 26 "github.com/verrazzano/verrazzano/tools/psr/backend/osenv" 27 "github.com/verrazzano/verrazzano/tools/psr/backend/spi" 28 ) 29 30 const ( 31 // metricsPrefix is the prefix that is automatically pre-pended to all metrics exported by this worker. 32 metricsPrefix = "opensearch_restart" 33 openSearchTier = "OPENSEARCH_TIER" 34 openSearchTierMetricName = "opensearch_tier" 35 ) 36 37 var funcNewPsrClient = k8sclient.NewPsrClient 38 39 type worker struct { 40 metricDescList []prometheus.Desc 41 *workerMetrics 42 psrClient k8sclient.PsrClient 43 log vzlog.VerrazzanoLogger 44 *restartData 45 } 46 47 type restartData struct { 48 restartStartTime int64 49 restartedPodUID types.UID 50 } 51 52 var _ spi.Worker = worker{} 53 54 // restartMetrics holds the metrics produced by the worker. Metrics must be thread safe. 55 type workerMetrics struct { 56 restartCount metrics.MetricItem 57 restartTime metrics.MetricItem 58 } 59 60 func NewRestartWorker() (spi.Worker, error) { 61 c, err := funcNewPsrClient() 62 if err != nil { 63 return nil, err 64 } 65 w := worker{ 66 psrClient: c, 67 log: vzlog.DefaultLogger(), 68 restartData: &restartData{}, 69 workerMetrics: &workerMetrics{ 70 restartCount: metrics.MetricItem{ 71 Name: "pod_restart_count", 72 Help: "The total number of OpenSearch pod restarts", 73 Type: prometheus.CounterValue, 74 }, 75 restartTime: metrics.MetricItem{ 76 Name: "pod_restart_time_nanoseconds", 77 Help: "The number of nanoseconds elapsed to restart the OpenSearch pod", 78 Type: prometheus.GaugeValue, 79 }, 80 }, 81 } 82 83 if err = config.PsrEnv.LoadFromEnv(w.GetEnvDescList()); err != nil { 84 return w, err 85 } 86 87 tier, err := psropensearch.ValidateOpenSeachTier(openSearchTier) 88 if err != nil { 89 return w, err 90 } 91 92 metricsLabels := map[string]string{ 93 openSearchTierMetricName: tier, 94 config.PsrWorkerTypeMetricsName: config.PsrEnv.GetEnv(config.PsrWorkerType), 95 } 96 97 w.metricDescList = metrics.BuildMetricDescList([]*metrics.MetricItem{ 98 &w.restartCount, 99 &w.restartTime, 100 }, metricsLabels, w.GetWorkerDesc().MetricsPrefix) 101 102 return w, nil 103 } 104 105 // GetWorkerDesc returns the WorkerDesc for the worker 106 func (w worker) GetWorkerDesc() spi.WorkerDesc { 107 return spi.WorkerDesc{ 108 WorkerType: config.WorkerTypeOpsRestart, 109 Description: "Worker to restart pods in the specified OpenSearch tier", 110 MetricsPrefix: metricsPrefix, 111 } 112 } 113 114 func (w worker) GetEnvDescList() []osenv.EnvVarDesc { 115 return []osenv.EnvVarDesc{ 116 {Key: openSearchTier, DefaultVal: "", Required: true}, 117 } 118 } 119 120 func (w worker) GetMetricDescList() []prometheus.Desc { 121 return w.metricDescList 122 } 123 124 func (w worker) GetMetricList() []prometheus.Metric { 125 return []prometheus.Metric{ 126 w.restartCount.BuildMetric(), 127 w.restartTime.BuildMetric(), 128 } 129 } 130 131 func (w worker) WantLoopInfoLogged() bool { 132 return false 133 } 134 135 // DoWork restarts a pod in the specified OpenSearch tier 136 func (w worker) DoWork(_ config.CommonConfig, log vzlog.VerrazzanoLogger) error { 137 // validate OS tier 138 tier, err := psropensearch.ValidateOpenSeachTier(openSearchTier) 139 if err != nil { 140 return err 141 } 142 143 // Wait for restarted pod to be ready 144 if err = w.podsReady(tier); err != nil { 145 return err 146 } 147 148 // Update the elapsed time of the restart operation 149 if w.restartStartTime > 0 { 150 atomic.StoreInt64(&w.workerMetrics.restartTime.Val, time.Now().UnixNano()-w.restartStartTime) 151 } 152 153 w.restartStartTime = time.Now().UnixNano() 154 if err = w.restartPod(tier); err != nil { 155 // reset restartStartTime to 0 so we don't emit a bogus metric on the next time through 156 w.restartStartTime = 0 157 return err 158 } 159 atomic.AddInt64(&w.workerMetrics.restartCount.Val, 1) 160 161 return nil 162 } 163 164 func (w worker) PreconditionsMet() (bool, error) { 165 return true, nil 166 } 167 168 func (w worker) podsReady(tier string) error { 169 var label string 170 var err error 171 switch tier { 172 case psropensearch.MasterTier: 173 //err = ready.StatefulSetsAreAvailable(w.psrClient.CrtlRuntime, []types.NamespacedName{{ 174 // Name: "vmi-system-es-master", 175 // Namespace: constants.VerrazzanoSystemNamespace, 176 //}}) 177 178 // there's no opensearch.verrazzano.io/role-master label on the master statefulset 179 // however master is the only tier that's deployed as a statefulset, so any opensearch sts must be master 180 label = "verrazzano-component" 181 err = ready.StatefulSetsAreAvailableBySelector(w.psrClient.CrtlRuntime, getSelectortForLabel(label, "opensearch")) 182 case psropensearch.DataTier: 183 label = "opensearch.verrazzano.io/role-data" 184 err = ready.DeploymentsAreAvailableBySelector(w.psrClient.CrtlRuntime, getSelectortForLabel(label, "true")) 185 case psropensearch.IngestTier: 186 label = "opensearch.verrazzano.io/role-ingest" 187 err = ready.DeploymentsAreAvailableBySelector(w.psrClient.CrtlRuntime, getSelectortForLabel(label, "true")) 188 } 189 if err != nil { 190 return err 191 } 192 pods, err := psropensearch.GetPodsForTier(w.psrClient.CrtlRuntime, tier) 193 if err != nil { 194 return err 195 } 196 for _, pod := range pods { 197 if pod.GetUID() == w.restartedPodUID { 198 return fmt.Errorf("restarted pod still found in cluster, requeuing") 199 } 200 } 201 return nil 202 } 203 204 func (w worker) restartPod(tier string) error { 205 pods, err := psropensearch.GetPodsForTier(w.psrClient.CrtlRuntime, tier) 206 if err != nil { 207 return err 208 } 209 if len(pods) == 0 { 210 return fmt.Errorf("Failed, no pods found for tier %s", tier) 211 } 212 i, err := rand.Int(rand.Reader, big.NewInt(int64(len(pods)))) 213 if err != nil { 214 return err 215 } 216 w.restartedPodUID = pods[i.Int64()].UID 217 return w.psrClient.CrtlRuntime.Delete(context.TODO(), &pods[i.Int64()]) 218 } 219 220 func getSelectortForLabel(key, val string) []client.ListOption { 221 req, _ := labels.NewRequirement(key, selection.Equals, []string{val}) 222 selector := labels.NewSelector().Add(*req) 223 return []client.ListOption{&client.ListOptions{ 224 Namespace: constants.VerrazzanoSystemNamespace, 225 LabelSelector: selector, 226 }} 227 }