github.com/verrazzano/verrazzano@v1.7.1/tools/psr/backend/workmanager/runner.go (about)

     1  // Copyright (c) 2022, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package workmanager
     5  
     6  import (
     7  	"crypto/rand"
     8  	"math/big"
     9  	"sync/atomic"
    10  	"time"
    11  
    12  	"github.com/prometheus/client_golang/prometheus"
    13  	"github.com/verrazzano/verrazzano/pkg/log/vzlog"
    14  	"github.com/verrazzano/verrazzano/tools/psr/backend/config"
    15  	"github.com/verrazzano/verrazzano/tools/psr/backend/metrics"
    16  	"github.com/verrazzano/verrazzano/tools/psr/backend/spi"
    17  )
    18  
    19  // WorkerRunner interface specifies a workerRunner that loops calling a worker
    20  type WorkerRunner interface {
    21  	// RunWorker runs the worker use case in a loop
    22  	RunWorker(config.CommonConfig, vzlog.VerrazzanoLogger) error
    23  
    24  	// WorkerMetricsProvider is an interface to get prometheus metrics information for the worker to do work
    25  	spi.WorkerMetricsProvider
    26  }
    27  
    28  // workerRunner is needed to run the worker
    29  type workerRunner struct {
    30  	spi.Worker
    31  	metricDescList []prometheus.Desc
    32  	*runnerMetrics
    33  	prevWorkFailed bool
    34  }
    35  
    36  var _ WorkerRunner = workerRunner{}
    37  
    38  // runnerMetrics holds the metrics produced by the workerRunner. Metrics must be thread safe.
    39  type runnerMetrics struct {
    40  	loopCount                  metrics.MetricItem
    41  	workerThreadCount          metrics.MetricItem
    42  	workerLoopNanoSeconds      metrics.MetricItem
    43  	workerDurationTotalSeconds metrics.MetricItem
    44  }
    45  
    46  // NewRunner creates a new workerRunner
    47  func NewRunner(worker spi.Worker, conf config.CommonConfig, log vzlog.VerrazzanoLogger) (WorkerRunner, error) {
    48  	r := workerRunner{Worker: worker, runnerMetrics: &runnerMetrics{
    49  		loopCount: metrics.MetricItem{
    50  			Name: "loop_count_total",
    51  			Help: "The total number of loops executed",
    52  			Type: prometheus.CounterValue,
    53  		},
    54  		workerThreadCount: metrics.MetricItem{
    55  			Name: "worker_thread_count_total",
    56  			Help: "The total number of worker threads (goroutines) running",
    57  			Type: prometheus.CounterValue,
    58  		},
    59  		workerLoopNanoSeconds: metrics.MetricItem{
    60  			Name: "worker_last_loop_nanoseconds",
    61  			Help: "The number of nanoseconds that the worker took to run the last loop of doing work",
    62  			Type: prometheus.GaugeValue,
    63  		},
    64  		workerDurationTotalSeconds: metrics.MetricItem{
    65  			Name: "worker_running_seconds_total",
    66  			Help: "The total number of seconds that the worker has been running",
    67  			Type: prometheus.CounterValue,
    68  		},
    69  	}}
    70  
    71  	r.metricDescList = []prometheus.Desc{
    72  		*r.loopCount.BuildMetricDesc(r.GetWorkerDesc().MetricsPrefix),
    73  		*r.workerThreadCount.BuildMetricDesc(r.GetWorkerDesc().MetricsPrefix),
    74  		*r.workerLoopNanoSeconds.BuildMetricDesc(r.GetWorkerDesc().MetricsPrefix),
    75  		*r.workerDurationTotalSeconds.BuildMetricDesc(r.GetWorkerDesc().MetricsPrefix),
    76  	}
    77  
    78  	return r, nil
    79  }
    80  
    81  // GetMetricDescList returns the prometheus metrics descriptors for the worker metrics.  Must be thread safe
    82  func (r workerRunner) GetMetricDescList() []prometheus.Desc {
    83  	return r.metricDescList
    84  }
    85  
    86  // GetMetricList returns the realtime metrics for the worker.  Must be thread safe
    87  func (r workerRunner) GetMetricList() []prometheus.Metric {
    88  	return []prometheus.Metric{
    89  		r.loopCount.BuildMetric(),
    90  		r.workerThreadCount.BuildMetric(),
    91  		r.workerLoopNanoSeconds.BuildMetric(),
    92  		r.workerDurationTotalSeconds.BuildMetric(),
    93  	}
    94  }
    95  
    96  // RunWorker runs the worker in a loop
    97  func (r workerRunner) RunWorker(conf config.CommonConfig, log vzlog.VerrazzanoLogger) error {
    98  	if conf.NumLoops == 0 {
    99  		return nil
   100  	}
   101  
   102  	r.incThreadCount()
   103  
   104  	// sleep before calling DoWork the first time
   105  	if err := sleepWithJitters(time.Millisecond * 100); err != nil {
   106  		return err
   107  	}
   108  
   109  	startTimeSecs := time.Now().Unix()
   110  	for {
   111  		loopCount := atomic.AddInt64(&r.runnerMetrics.loopCount.Val, 1)
   112  
   113  		// call the wrapped worker.  Log any error but keep working
   114  		startLoop := time.Now().UnixNano()
   115  		err := r.Worker.DoWork(conf, log)
   116  		if err != nil {
   117  			r.prevWorkFailed = true
   118  			log.ErrorfThrottled("Failed calling %s to do work: %v", r.Worker.GetWorkerDesc().WorkerType, err)
   119  		} else {
   120  			if r.prevWorkFailed {
   121  				// If we had a failure on the prev call then log success, so you can tell
   122  				// get is working just be looking at the pod log.
   123  				log.Info("Next call to DoWork from workerRunner successful after previous DoWork failed")
   124  			}
   125  			if loopCount == 1 {
   126  				log.Info("First call to DoWork succeeded")
   127  			}
   128  			r.prevWorkFailed = false
   129  		}
   130  		log.GetZapLogger().Sync()
   131  
   132  		durationSecondsTotal := time.Now().Unix() - startTimeSecs
   133  		atomic.StoreInt64(&r.runnerMetrics.workerLoopNanoSeconds.Val, time.Now().UnixNano()-startLoop)
   134  		atomic.StoreInt64(&r.runnerMetrics.workerDurationTotalSeconds.Val, durationSecondsTotal)
   135  		if r.Worker.WantLoopInfoLogged() {
   136  			log.Infof("Loop Count: %v, Total seconds from start of the first worker loop until now: %v", loopCount, durationSecondsTotal)
   137  		}
   138  		if time.Duration(durationSecondsTotal) >= conf.PsrDuration*time.Second && conf.PsrDuration != config.UnlimitedWorkerDuration {
   139  			log.Infof("Worker has reached its run duration of %s", conf.PsrDuration)
   140  			return nil
   141  		}
   142  		if loopCount == conf.NumLoops && conf.NumLoops != config.UnlimitedWorkerLoops {
   143  			log.Infof("Worker has reached its number of %s loops", conf.NumLoops)
   144  			return nil
   145  		}
   146  		if err = sleepWithJitters(conf.LoopSleepNanos); err != nil {
   147  			return err
   148  		}
   149  	}
   150  }
   151  
   152  func (r workerRunner) incThreadCount() {
   153  	atomic.AddInt64(&r.runnerMetrics.workerThreadCount.Val, 1)
   154  }
   155  
   156  func sleepWithJitters(duration time.Duration) error {
   157  	num, err := rand.Int(rand.Reader, big.NewInt(int64(20)))
   158  	if err != nil {
   159  		return err
   160  	}
   161  	// add between a -10% amd 10% jitter
   162  	jitter := (duration.Nanoseconds() / 100) * (num.Int64() - 10)
   163  	time.Sleep(duration + time.Duration(jitter))
   164  	return nil
   165  }