github.com/MetalBlockchain/metalgo@v1.11.9/api/health/worker.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package health

import (
	"context"
	"errors"
	"fmt"
	"maps"
	"slices"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"

	"github.com/MetalBlockchain/metalgo/utils"
	"github.com/MetalBlockchain/metalgo/utils/logging"
	"github.com/MetalBlockchain/metalgo/utils/set"
)

var (
	allTags = []string{AllTag}

	errRestrictedTag  = errors.New("restricted tag")
	errDuplicateCheck = errors.New("duplicated check")
)

type worker struct {
	log           logging.Logger
	name          string
	failingChecks *prometheus.GaugeVec
	checksLock    sync.RWMutex
	checks        map[string]*taggedChecker

	resultsLock                 sync.RWMutex
	results                     map[string]Result
	numFailingApplicationChecks int
	tags                        map[string]set.Set[string] // tag -> set of check names

	startOnce sync.Once
	closeOnce sync.Once
	wg        sync.WaitGroup
	closer    chan struct{}
}

type taggedChecker struct {
	checker            Checker
	isApplicationCheck bool
	tags               []string
}

func newWorker(
	log logging.Logger,
	name string,
	failingChecks *prometheus.GaugeVec,
) *worker {
	// Initialize the number of failing checks to 0 for all checks
	for _, tag := range []string{AllTag, ApplicationTag} {
		failingChecks.With(prometheus.Labels{
			CheckLabel: name,
			TagLabel:   tag,
		}).Set(0)
	}
	return &worker{
		log:           log,
		name:          name,
		failingChecks: failingChecks,
		checks:        make(map[string]*taggedChecker),
		results:       make(map[string]Result),
		closer:        make(chan struct{}),
		tags:          make(map[string]set.Set[string]),
	}
}

func (w *worker) RegisterCheck(name string, check Checker, tags ...string) error {
	// We ensure [AllTag] isn't contained in [tags] to prevent metrics from
	// double counting.
	if slices.Contains(tags, AllTag) {
		return fmt.Errorf("%w: %q", errRestrictedTag, AllTag)
	}

	w.checksLock.Lock()
	defer w.checksLock.Unlock()

	if _, ok := w.checks[name]; ok {
		return fmt.Errorf("%w: %q", errDuplicateCheck, name)
	}

	w.resultsLock.Lock()
	defer w.resultsLock.Unlock()

	// Add the check to each tag
	for _, tag := range tags {
		names := w.tags[tag]
		names.Add(name)
		w.tags[tag] = names
	}
	// Add the special AllTag descriptor
	names := w.tags[AllTag]
	names.Add(name)
	w.tags[AllTag] = names

	applicationChecks := w.tags[ApplicationTag]
	tc := &taggedChecker{
		checker:            check,
		isApplicationCheck: applicationChecks.Contains(name),
		tags:               tags,
	}
	w.checks[name] = tc
	w.results[name] = notYetRunResult

	// Whenever a new check is added - it is failing
	w.log.Info("registered new check and initialized its state to failing",
		zap.String("namespace", w.name),
		zap.String("name", name),
		zap.Strings("tags", tags),
	)

	// If this is a new application-wide check, then all of the registered tags
	// now have one additional failing check.
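	// In either case the new check starts out failing: the gauges for its
	// tags and for [AllTag] are incremented now and are decremented again the
	// first time the check reports healthy in [runCheck].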
	w.updateMetrics(tc, false /*=healthy*/, true /*=register*/)
	return nil
}

func (w *worker) RegisterMonotonicCheck(name string, checker Checker, tags ...string) error {
	var result utils.Atomic[any]
	return w.RegisterCheck(name, CheckerFunc(func(ctx context.Context) (any, error) {
		details := result.Get()
		if details != nil {
			return details, nil
		}

		details, err := checker.HealthCheck(ctx)
		if err == nil {
			result.Set(details)
		}
		return details, err
	}), tags...)
}

func (w *worker) Results(tags ...string) (map[string]Result, bool) {
	w.resultsLock.RLock()
	defer w.resultsLock.RUnlock()

	// if no tags are specified, return all checks
	if len(tags) == 0 {
		tags = allTags
	}

	names := set.Set[string]{}
	tagSet := set.Of(tags...)
	tagSet.Add(ApplicationTag) // we always want to include the application tag
	for tag := range tagSet {
		if set, ok := w.tags[tag]; ok {
			names.Union(set)
		}
	}

	results := make(map[string]Result, names.Len())
	healthy := true
	for name := range names {
		if result, ok := w.results[name]; ok {
			results[name] = result
			healthy = healthy && result.Error == nil
		}
	}
	return results, healthy
}

func (w *worker) Start(ctx context.Context, freq time.Duration) {
	w.startOnce.Do(func() {
		detachedCtx := context.WithoutCancel(ctx)
		w.wg.Add(1)
		go func() {
			ticker := time.NewTicker(freq)
			defer func() {
				ticker.Stop()
				w.wg.Done()
			}()

			w.runChecks(detachedCtx)
			for {
				select {
				case <-ticker.C:
					w.runChecks(detachedCtx)
				case <-w.closer:
					return
				}
			}
		}()
	})
}

func (w *worker) Stop() {
	w.closeOnce.Do(func() {
		close(w.closer)
		w.wg.Wait()
	})
}

func (w *worker) runChecks(ctx context.Context) {
	w.checksLock.RLock()
	// Copy the [w.checks] map to collect the checks that we will be running
	// during this iteration. If [w.checks] is modified during this iteration of
	// [runChecks], then the added check will not be run until the next
	// iteration.
	checks := maps.Clone(w.checks)
	w.checksLock.RUnlock()

	var wg sync.WaitGroup
	wg.Add(len(checks))
	for name, check := range checks {
		go w.runCheck(ctx, &wg, name, check)
	}
	wg.Wait()
}

func (w *worker) runCheck(ctx context.Context, wg *sync.WaitGroup, name string, check *taggedChecker) {
	defer wg.Done()

	start := time.Now()

	// To avoid any deadlocks when [RegisterCheck] is called with a lock
	// that is grabbed by [check.HealthCheck], we ensure that no locks
	// are held when [check.HealthCheck] is called.
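	// Note that [ctx] here is the detached context created in [Start], so
	// cancelling the context originally passed to [Start] does not interrupt
	// an in-flight health check.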
	details, err := check.checker.HealthCheck(ctx)
	end := time.Now()

	result := Result{
		Details:   details,
		Timestamp: end,
		Duration:  end.Sub(start),
	}

	w.resultsLock.Lock()
	defer w.resultsLock.Unlock()
	prevResult := w.results[name]
	if err != nil {
		errString := err.Error()
		result.Error = &errString

		result.ContiguousFailures = prevResult.ContiguousFailures + 1
		if prevResult.ContiguousFailures > 0 {
			result.TimeOfFirstFailure = prevResult.TimeOfFirstFailure
		} else {
			result.TimeOfFirstFailure = &end
		}

		if prevResult.Error == nil {
			w.log.Warn("check started failing",
				zap.String("namespace", w.name),
				zap.String("name", name),
				zap.Strings("tags", check.tags),
				zap.Error(err),
			)
			w.updateMetrics(check, false /*=healthy*/, false /*=register*/)
		}
	} else if prevResult.Error != nil {
		w.log.Info("check started passing",
			zap.String("namespace", w.name),
			zap.String("name", name),
			zap.Strings("tags", check.tags),
		)
		w.updateMetrics(check, true /*=healthy*/, false /*=register*/)
	}
	w.results[name] = result
}

// updateMetrics updates the metrics for the given check. If [healthy] is true,
// then the check is considered healthy and the metrics are decremented.
// Otherwise, the check is considered unhealthy and the metrics are incremented.
// [register] must be true only if this is the first time the check is being
// registered.
func (w *worker) updateMetrics(tc *taggedChecker, healthy bool, register bool) {
	if tc.isApplicationCheck {
		// Note: [w.tags] will include AllTag.
		for tag := range w.tags {
			gauge := w.failingChecks.With(prometheus.Labels{
				CheckLabel: w.name,
				TagLabel:   tag,
			})
			if healthy {
				gauge.Dec()
			} else {
				gauge.Inc()
			}
		}
		if healthy {
			w.numFailingApplicationChecks--
		} else {
			w.numFailingApplicationChecks++
		}
	} else {
		for _, tag := range tc.tags {
			gauge := w.failingChecks.With(prometheus.Labels{
				CheckLabel: w.name,
				TagLabel:   tag,
			})
			if healthy {
				gauge.Dec()
			} else {
				gauge.Inc()
				// If this is the first time this tag was registered, we also need to
				// account for the currently failing application-wide checks.
				if register && w.tags[tag].Len() == 1 {
					gauge.Add(float64(w.numFailingApplicationChecks))
				}
			}
		}
		gauge := w.failingChecks.With(prometheus.Labels{
			CheckLabel: w.name,
			TagLabel:   AllTag,
		})
		if healthy {
			gauge.Dec()
		} else {
			gauge.Inc()
		}
	}
}
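// exampleWorkerUsage is an illustrative sketch, not part of the original
// worker.go: it shows how a worker could be wired together from inside this
// package, assuming the CheckerFunc adapter, the CheckLabel/TagLabel
// constants, and logging.NoLog behave as they do elsewhere in metalgo. The
// check body, check name, and metric name are placeholders.
func exampleWorkerUsage() {
	failing := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "health_checks_failing",
		Help: "number of currently failing health checks",
	}, []string{CheckLabel, TagLabel})

	w := newWorker(logging.NoLog{}, "readiness", failing)

	// A newly registered check is reported as failing until it passes once.
	_ = w.RegisterCheck("database", CheckerFunc(func(context.Context) (any, error) {
		return "ok", nil // a real check would probe the database here
	}))

	// Run every registered check once now, then every 30 seconds until Stop
	// is called.
	w.Start(context.Background(), 30*time.Second)
	defer w.Stop()

	// With no tags, Results reports every check registered under [AllTag].
	results, healthy := w.Results()
	_, _ = results, healthy
}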