github.com/verrazzano/verrazzano@v1.7.1/tools/psr/backend/workers/opensearch/restart/restart.go (about)

     1  // Copyright (c) 2022, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package restart
     5  
     6  import (
     7  	"context"
     8  	"crypto/rand"
     9  	"fmt"
    10  	"github.com/verrazzano/verrazzano/pkg/constants"
    11  	"github.com/verrazzano/verrazzano/pkg/k8s/ready"
    12  	"github.com/verrazzano/verrazzano/tools/psr/backend/pkg/k8sclient"
    13  	psropensearch "github.com/verrazzano/verrazzano/tools/psr/backend/pkg/opensearch"
    14  	"k8s.io/apimachinery/pkg/labels"
    15  	"k8s.io/apimachinery/pkg/selection"
    16  	"k8s.io/apimachinery/pkg/types"
    17  	"math/big"
    18  	"sigs.k8s.io/controller-runtime/pkg/client"
    19  	"sync/atomic"
    20  	"time"
    21  
    22  	"github.com/prometheus/client_golang/prometheus"
    23  	"github.com/verrazzano/verrazzano/pkg/log/vzlog"
    24  	"github.com/verrazzano/verrazzano/tools/psr/backend/config"
    25  	"github.com/verrazzano/verrazzano/tools/psr/backend/metrics"
    26  	"github.com/verrazzano/verrazzano/tools/psr/backend/osenv"
    27  	"github.com/verrazzano/verrazzano/tools/psr/backend/spi"
    28  )
    29  
// Constants used by the OpenSearch restart worker:
//   - openSearchTier is the key of the required environment variable declared in
//     GetEnvDescList and passed to psropensearch.ValidateOpenSeachTier.
//   - openSearchTierMetricName is the metric label key under which the validated
//     tier is attached to this worker's metrics in NewRestartWorker.
const (
	// metricsPrefix is the prefix that is automatically pre-pended to all metrics exported by this worker.
	metricsPrefix            = "opensearch_restart"
	openSearchTier           = "OPENSEARCH_TIER"
	openSearchTierMetricName = "opensearch_tier"
)
    36  
// funcNewPsrClient is the constructor NewRestartWorker calls to obtain the PSR client.
// NOTE(review): it is held in a package variable, presumably so tests can substitute
// a fake constructor — confirm against the test files.
var funcNewPsrClient = k8sclient.NewPsrClient
    38  
// worker is the OpenSearch restart worker. All of its methods use value receivers;
// the state they mutate (workerMetrics and restartData) is embedded by pointer, so
// updates made through a copy of worker are visible to every other copy.
//
// metricDescList holds the metric descriptors built in NewRestartWorker.
// psrClient provides the CrtlRuntime client used to check readiness and to list and
// delete pods; log is initialized to vzlog.DefaultLogger() in NewRestartWorker.
type worker struct {
	metricDescList []prometheus.Desc
	*workerMetrics
	psrClient k8sclient.PsrClient
	log       vzlog.VerrazzanoLogger
	*restartData
}
    46  
// restartData carries restart state from one DoWork invocation to the next.
//
// restartStartTime is the time.Now().UnixNano() value captured by DoWork just before
// it calls restartPod; DoWork resets it to 0 when restartPod fails, and a value of 0
// suppresses the elapsed-time metric update.
// restartedPodUID is the UID recorded by restartPod for the pod it deletes; podsReady
// returns an error while a pod with this UID is still listed for the tier.
type restartData struct {
	restartStartTime int64
	restartedPodUID  types.UID
}
    51  
// Compile-time assertion that the worker value type implements spi.Worker.
var _ spi.Worker = worker{}
    53  
// workerMetrics holds the metrics produced by the worker. Metrics must be thread safe.
//
// restartCount is incremented by DoWork after each successful restartPod call.
// restartTime is set by DoWork to the nanoseconds elapsed since the previous restart
// was started, once podsReady reports success.
type workerMetrics struct {
	restartCount metrics.MetricItem
	restartTime  metrics.MetricItem
}
    59  
    60  func NewRestartWorker() (spi.Worker, error) {
    61  	c, err := funcNewPsrClient()
    62  	if err != nil {
    63  		return nil, err
    64  	}
    65  	w := worker{
    66  		psrClient:   c,
    67  		log:         vzlog.DefaultLogger(),
    68  		restartData: &restartData{},
    69  		workerMetrics: &workerMetrics{
    70  			restartCount: metrics.MetricItem{
    71  				Name: "pod_restart_count",
    72  				Help: "The total number of OpenSearch pod restarts",
    73  				Type: prometheus.CounterValue,
    74  			},
    75  			restartTime: metrics.MetricItem{
    76  				Name: "pod_restart_time_nanoseconds",
    77  				Help: "The number of nanoseconds elapsed to restart the OpenSearch pod",
    78  				Type: prometheus.GaugeValue,
    79  			},
    80  		},
    81  	}
    82  
    83  	if err = config.PsrEnv.LoadFromEnv(w.GetEnvDescList()); err != nil {
    84  		return w, err
    85  	}
    86  
    87  	tier, err := psropensearch.ValidateOpenSeachTier(openSearchTier)
    88  	if err != nil {
    89  		return w, err
    90  	}
    91  
    92  	metricsLabels := map[string]string{
    93  		openSearchTierMetricName:        tier,
    94  		config.PsrWorkerTypeMetricsName: config.PsrEnv.GetEnv(config.PsrWorkerType),
    95  	}
    96  
    97  	w.metricDescList = metrics.BuildMetricDescList([]*metrics.MetricItem{
    98  		&w.restartCount,
    99  		&w.restartTime,
   100  	}, metricsLabels, w.GetWorkerDesc().MetricsPrefix)
   101  
   102  	return w, nil
   103  }
   104  
   105  // GetWorkerDesc returns the WorkerDesc for the worker
   106  func (w worker) GetWorkerDesc() spi.WorkerDesc {
   107  	return spi.WorkerDesc{
   108  		WorkerType:    config.WorkerTypeOpsRestart,
   109  		Description:   "Worker to restart pods in the specified OpenSearch tier",
   110  		MetricsPrefix: metricsPrefix,
   111  	}
   112  }
   113  
   114  func (w worker) GetEnvDescList() []osenv.EnvVarDesc {
   115  	return []osenv.EnvVarDesc{
   116  		{Key: openSearchTier, DefaultVal: "", Required: true},
   117  	}
   118  }
   119  
   120  func (w worker) GetMetricDescList() []prometheus.Desc {
   121  	return w.metricDescList
   122  }
   123  
   124  func (w worker) GetMetricList() []prometheus.Metric {
   125  	return []prometheus.Metric{
   126  		w.restartCount.BuildMetric(),
   127  		w.restartTime.BuildMetric(),
   128  	}
   129  }
   130  
   131  func (w worker) WantLoopInfoLogged() bool {
   132  	return false
   133  }
   134  
   135  // DoWork restarts a pod in the specified OpenSearch tier
   136  func (w worker) DoWork(_ config.CommonConfig, log vzlog.VerrazzanoLogger) error {
   137  	// validate OS tier
   138  	tier, err := psropensearch.ValidateOpenSeachTier(openSearchTier)
   139  	if err != nil {
   140  		return err
   141  	}
   142  
   143  	// Wait for restarted pod to be ready
   144  	if err = w.podsReady(tier); err != nil {
   145  		return err
   146  	}
   147  
   148  	// Update the elapsed time of the restart operation
   149  	if w.restartStartTime > 0 {
   150  		atomic.StoreInt64(&w.workerMetrics.restartTime.Val, time.Now().UnixNano()-w.restartStartTime)
   151  	}
   152  
   153  	w.restartStartTime = time.Now().UnixNano()
   154  	if err = w.restartPod(tier); err != nil {
   155  		// reset restartStartTime to 0 so we don't emit a bogus metric on the next time through
   156  		w.restartStartTime = 0
   157  		return err
   158  	}
   159  	atomic.AddInt64(&w.workerMetrics.restartCount.Val, 1)
   160  
   161  	return nil
   162  }
   163  
   164  func (w worker) PreconditionsMet() (bool, error) {
   165  	return true, nil
   166  }
   167  
   168  func (w worker) podsReady(tier string) error {
   169  	var label string
   170  	var err error
   171  	switch tier {
   172  	case psropensearch.MasterTier:
   173  		//err = ready.StatefulSetsAreAvailable(w.psrClient.CrtlRuntime, []types.NamespacedName{{
   174  		//	Name:      "vmi-system-es-master",
   175  		//	Namespace: constants.VerrazzanoSystemNamespace,
   176  		//}})
   177  
   178  		// there's no opensearch.verrazzano.io/role-master label on the master statefulset
   179  		// however master is the only tier that's deployed as a statefulset, so any opensearch sts must be master
   180  		label = "verrazzano-component"
   181  		err = ready.StatefulSetsAreAvailableBySelector(w.psrClient.CrtlRuntime, getSelectortForLabel(label, "opensearch"))
   182  	case psropensearch.DataTier:
   183  		label = "opensearch.verrazzano.io/role-data"
   184  		err = ready.DeploymentsAreAvailableBySelector(w.psrClient.CrtlRuntime, getSelectortForLabel(label, "true"))
   185  	case psropensearch.IngestTier:
   186  		label = "opensearch.verrazzano.io/role-ingest"
   187  		err = ready.DeploymentsAreAvailableBySelector(w.psrClient.CrtlRuntime, getSelectortForLabel(label, "true"))
   188  	}
   189  	if err != nil {
   190  		return err
   191  	}
   192  	pods, err := psropensearch.GetPodsForTier(w.psrClient.CrtlRuntime, tier)
   193  	if err != nil {
   194  		return err
   195  	}
   196  	for _, pod := range pods {
   197  		if pod.GetUID() == w.restartedPodUID {
   198  			return fmt.Errorf("restarted pod still found in cluster, requeuing")
   199  		}
   200  	}
   201  	return nil
   202  }
   203  
   204  func (w worker) restartPod(tier string) error {
   205  	pods, err := psropensearch.GetPodsForTier(w.psrClient.CrtlRuntime, tier)
   206  	if err != nil {
   207  		return err
   208  	}
   209  	if len(pods) == 0 {
   210  		return fmt.Errorf("Failed, no pods found for tier %s", tier)
   211  	}
   212  	i, err := rand.Int(rand.Reader, big.NewInt(int64(len(pods))))
   213  	if err != nil {
   214  		return err
   215  	}
   216  	w.restartedPodUID = pods[i.Int64()].UID
   217  	return w.psrClient.CrtlRuntime.Delete(context.TODO(), &pods[i.Int64()])
   218  }
   219  
   220  func getSelectortForLabel(key, val string) []client.ListOption {
   221  	req, _ := labels.NewRequirement(key, selection.Equals, []string{val})
   222  	selector := labels.NewSelector().Add(*req)
   223  	return []client.ListOption{&client.ListOptions{
   224  		Namespace:     constants.VerrazzanoSystemNamespace,
   225  		LabelSelector: selector,
   226  	}}
   227  }