// Copyright (c) 2022, Oracle and/or its affiliates.
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.

package workmanager

import (
	"crypto/rand"
	"math/big"
	"sync/atomic"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/verrazzano/verrazzano/pkg/log/vzlog"
	"github.com/verrazzano/verrazzano/tools/psr/backend/config"
	"github.com/verrazzano/verrazzano/tools/psr/backend/metrics"
	"github.com/verrazzano/verrazzano/tools/psr/backend/spi"
)

// WorkerRunner interface specifies a workerRunner that loops calling a worker.
type WorkerRunner interface {
	// RunWorker runs the worker use case in a loop.
	RunWorker(config.CommonConfig, vzlog.VerrazzanoLogger) error

	// WorkerMetricsProvider is an interface to get prometheus metrics information for the worker to do work.
	spi.WorkerMetricsProvider
}

// workerRunner is needed to run the worker.  It wraps the worker it runs and
// carries the runner-level prometheus metrics.
type workerRunner struct {
	spi.Worker
	metricDescList []prometheus.Desc
	*runnerMetrics
	// prevWorkFailed tracks whether the previous DoWork call failed, so a
	// recovery can be logged on the next success.  NOTE(review): methods use a
	// value receiver, so this flag persists only within a single RunWorker call.
	prevWorkFailed bool
}

// Compile-time check that workerRunner satisfies WorkerRunner.
var _ WorkerRunner = workerRunner{}

// runnerMetrics holds the metrics produced by the workerRunner. Metrics must be thread safe.
39 type runnerMetrics struct { 40 loopCount metrics.MetricItem 41 workerThreadCount metrics.MetricItem 42 workerLoopNanoSeconds metrics.MetricItem 43 workerDurationTotalSeconds metrics.MetricItem 44 } 45 46 // NewRunner creates a new workerRunner 47 func NewRunner(worker spi.Worker, conf config.CommonConfig, log vzlog.VerrazzanoLogger) (WorkerRunner, error) { 48 r := workerRunner{Worker: worker, runnerMetrics: &runnerMetrics{ 49 loopCount: metrics.MetricItem{ 50 Name: "loop_count_total", 51 Help: "The total number of loops executed", 52 Type: prometheus.CounterValue, 53 }, 54 workerThreadCount: metrics.MetricItem{ 55 Name: "worker_thread_count_total", 56 Help: "The total number of worker threads (goroutines) running", 57 Type: prometheus.CounterValue, 58 }, 59 workerLoopNanoSeconds: metrics.MetricItem{ 60 Name: "worker_last_loop_nanoseconds", 61 Help: "The number of nanoseconds that the worker took to run the last loop of doing work", 62 Type: prometheus.GaugeValue, 63 }, 64 workerDurationTotalSeconds: metrics.MetricItem{ 65 Name: "worker_running_seconds_total", 66 Help: "The total number of seconds that the worker has been running", 67 Type: prometheus.CounterValue, 68 }, 69 }} 70 71 r.metricDescList = []prometheus.Desc{ 72 *r.loopCount.BuildMetricDesc(r.GetWorkerDesc().MetricsPrefix), 73 *r.workerThreadCount.BuildMetricDesc(r.GetWorkerDesc().MetricsPrefix), 74 *r.workerLoopNanoSeconds.BuildMetricDesc(r.GetWorkerDesc().MetricsPrefix), 75 *r.workerDurationTotalSeconds.BuildMetricDesc(r.GetWorkerDesc().MetricsPrefix), 76 } 77 78 return r, nil 79 } 80 81 // GetMetricDescList returns the prometheus metrics descriptors for the worker metrics. Must be thread safe 82 func (r workerRunner) GetMetricDescList() []prometheus.Desc { 83 return r.metricDescList 84 } 85 86 // GetMetricList returns the realtime metrics for the worker. 
Must be thread safe 87 func (r workerRunner) GetMetricList() []prometheus.Metric { 88 return []prometheus.Metric{ 89 r.loopCount.BuildMetric(), 90 r.workerThreadCount.BuildMetric(), 91 r.workerLoopNanoSeconds.BuildMetric(), 92 r.workerDurationTotalSeconds.BuildMetric(), 93 } 94 } 95 96 // RunWorker runs the worker in a loop 97 func (r workerRunner) RunWorker(conf config.CommonConfig, log vzlog.VerrazzanoLogger) error { 98 if conf.NumLoops == 0 { 99 return nil 100 } 101 102 r.incThreadCount() 103 104 // sleep before calling DoWork the first time 105 if err := sleepWithJitters(time.Millisecond * 100); err != nil { 106 return err 107 } 108 109 startTimeSecs := time.Now().Unix() 110 for { 111 loopCount := atomic.AddInt64(&r.runnerMetrics.loopCount.Val, 1) 112 113 // call the wrapped worker. Log any error but keep working 114 startLoop := time.Now().UnixNano() 115 err := r.Worker.DoWork(conf, log) 116 if err != nil { 117 r.prevWorkFailed = true 118 log.ErrorfThrottled("Failed calling %s to do work: %v", r.Worker.GetWorkerDesc().WorkerType, err) 119 } else { 120 if r.prevWorkFailed { 121 // If we had a failure on the prev call then log success, so you can tell 122 // get is working just be looking at the pod log. 
123 log.Info("Next call to DoWork from workerRunner successful after previous DoWork failed") 124 } 125 if loopCount == 1 { 126 log.Info("First call to DoWork succeeded") 127 } 128 r.prevWorkFailed = false 129 } 130 log.GetZapLogger().Sync() 131 132 durationSecondsTotal := time.Now().Unix() - startTimeSecs 133 atomic.StoreInt64(&r.runnerMetrics.workerLoopNanoSeconds.Val, time.Now().UnixNano()-startLoop) 134 atomic.StoreInt64(&r.runnerMetrics.workerDurationTotalSeconds.Val, durationSecondsTotal) 135 if r.Worker.WantLoopInfoLogged() { 136 log.Infof("Loop Count: %v, Total seconds from start of the first worker loop until now: %v", loopCount, durationSecondsTotal) 137 } 138 if time.Duration(durationSecondsTotal) >= conf.PsrDuration*time.Second && conf.PsrDuration != config.UnlimitedWorkerDuration { 139 log.Infof("Worker has reached its run duration of %s", conf.PsrDuration) 140 return nil 141 } 142 if loopCount == conf.NumLoops && conf.NumLoops != config.UnlimitedWorkerLoops { 143 log.Infof("Worker has reached its number of %s loops", conf.NumLoops) 144 return nil 145 } 146 if err = sleepWithJitters(conf.LoopSleepNanos); err != nil { 147 return err 148 } 149 } 150 } 151 152 func (r workerRunner) incThreadCount() { 153 atomic.AddInt64(&r.runnerMetrics.workerThreadCount.Val, 1) 154 } 155 156 func sleepWithJitters(duration time.Duration) error { 157 num, err := rand.Int(rand.Reader, big.NewInt(int64(20))) 158 if err != nil { 159 return err 160 } 161 // add between a -10% amd 10% jitter 162 jitter := (duration.Nanoseconds() / 100) * (num.Int64() - 10) 163 time.Sleep(duration + time.Duration(jitter)) 164 return nil 165 }