github.com/MetalBlockchain/metalgo@v1.11.9/api/health/worker.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package health

import (
	"context"
	"errors"
	"fmt"
	"maps"
	"slices"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"

	"github.com/MetalBlockchain/metalgo/utils"
	"github.com/MetalBlockchain/metalgo/utils/logging"
	"github.com/MetalBlockchain/metalgo/utils/set"
)

var (
	allTags = []string{AllTag}

	errRestrictedTag  = errors.New("restricted tag")
	errDuplicateCheck = errors.New("duplicated check")
)

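// worker periodically runs a set of registered health checks and caches their
// latest results. The checks map is guarded by [checksLock] and the results
// map by [resultsLock].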
type worker struct {
	log           logging.Logger
	name          string
	failingChecks *prometheus.GaugeVec
	checksLock    sync.RWMutex
	checks        map[string]*taggedChecker

	resultsLock                 sync.RWMutex
	results                     map[string]Result
	numFailingApplicationChecks int
	tags                        map[string]set.Set[string] // tag -> set of check names

	startOnce sync.Once
	closeOnce sync.Once
	wg        sync.WaitGroup
	closer    chan struct{}
}

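// taggedChecker couples a Checker with the tags it was registered under and
// records whether it is an application-wide check.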
type taggedChecker struct {
	checker            Checker
	isApplicationCheck bool
	tags               []string
}

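// newWorker returns a worker that reports its state through [failingChecks].
// The failing-check gauges for the AllTag and ApplicationTag labels are
// initialized to 0.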
func newWorker(
	log logging.Logger,
	name string,
	failingChecks *prometheus.GaugeVec,
) *worker {
	// Initialize the number of failing checks to 0 for all checks
	for _, tag := range []string{AllTag, ApplicationTag} {
		failingChecks.With(prometheus.Labels{
			CheckLabel: name,
			TagLabel:   tag,
		}).Set(0)
	}
	return &worker{
		log:           log,
		name:          name,
		failingChecks: failingChecks,
		checks:        make(map[string]*taggedChecker),
		results:       make(map[string]Result),
		closer:        make(chan struct{}),
		tags:          make(map[string]set.Set[string]),
	}
}

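// RegisterCheck adds [check] under [name] and associates it with [tags] as
// well as the implicit AllTag. The check's initial result is recorded as not
// yet run, so it reports as failing until it first succeeds. Registering the
// AllTag directly or reusing an existing name returns an error.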
func (w *worker) RegisterCheck(name string, check Checker, tags ...string) error {
	// We ensure [AllTag] isn't contained in [tags] to prevent metrics from
	// double counting.
	if slices.Contains(tags, AllTag) {
		return fmt.Errorf("%w: %q", errRestrictedTag, AllTag)
	}

	w.checksLock.Lock()
	defer w.checksLock.Unlock()

	if _, ok := w.checks[name]; ok {
		return fmt.Errorf("%w: %q", errDuplicateCheck, name)
	}

	w.resultsLock.Lock()
	defer w.resultsLock.Unlock()

	// Add the check to each tag
	for _, tag := range tags {
		names := w.tags[tag]
		names.Add(name)
		w.tags[tag] = names
	}
	// Add the special AllTag descriptor
	names := w.tags[AllTag]
	names.Add(name)
	w.tags[AllTag] = names

	applicationChecks := w.tags[ApplicationTag]
	tc := &taggedChecker{
		checker:            check,
		isApplicationCheck: applicationChecks.Contains(name),
		tags:               tags,
	}
	w.checks[name] = tc
	w.results[name] = notYetRunResult

	// Whenever a new check is added, it starts out in the failing state.
	w.log.Info("registered new check and initialized its state to failing",
		zap.String("worker", w.name),
		zap.String("name", name),
		zap.Strings("tags", tags),
	)

	// If this is a new application-wide check, then all of the registered tags
	// now have one additional failing check.
	w.updateMetrics(tc, false /*=healthy*/, true /*=register*/)
	return nil
}

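// RegisterMonotonicCheck registers a check whose result is cached after its
// first successful run; subsequent runs return the cached details without
// invoking [checker] again.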
func (w *worker) RegisterMonotonicCheck(name string, checker Checker, tags ...string) error {
	var result utils.Atomic[any]
	return w.RegisterCheck(name, CheckerFunc(func(ctx context.Context) (any, error) {
		details := result.Get()
		if details != nil {
			return details, nil
		}

		details, err := checker.HealthCheck(ctx)
		if err == nil {
			result.Set(details)
		}
		return details, err
	}), tags...)
}

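// Results returns the most recent result of every check that matches at least
// one of [tags] (or of every check if no tags are provided), always including
// application-wide checks. The returned bool is true iff none of the returned
// results contains an error.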
func (w *worker) Results(tags ...string) (map[string]Result, bool) {
	w.resultsLock.RLock()
	defer w.resultsLock.RUnlock()

	// if no tags are specified, return all checks
	if len(tags) == 0 {
		tags = allTags
	}

	names := set.Set[string]{}
	tagSet := set.Of(tags...)
	tagSet.Add(ApplicationTag) // we always want to include the application tag
	for tag := range tagSet {
		if set, ok := w.tags[tag]; ok {
			names.Union(set)
		}
	}

	results := make(map[string]Result, names.Len())
	healthy := true
	for name := range names {
		if result, ok := w.results[name]; ok {
			results[name] = result
			healthy = healthy && result.Error == nil
		}
	}
	return results, healthy
}

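// Start begins running the health checks every [freq] until Stop is called.
// The checks run on a context detached from [ctx]'s cancellation. Calling
// Start more than once has no additional effect.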
func (w *worker) Start(ctx context.Context, freq time.Duration) {
	w.startOnce.Do(func() {
		detachedCtx := context.WithoutCancel(ctx)
		w.wg.Add(1)
		go func() {
			ticker := time.NewTicker(freq)
			defer func() {
				ticker.Stop()
				w.wg.Done()
			}()

			w.runChecks(detachedCtx)
			for {
				select {
				case <-ticker.C:
					w.runChecks(detachedCtx)
				case <-w.closer:
					return
				}
			}
		}()
	})
}

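// Stop terminates the check loop and blocks until it has exited. Calling Stop
// more than once has no additional effect.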
func (w *worker) Stop() {
	w.closeOnce.Do(func() {
		close(w.closer)
		w.wg.Wait()
	})
}

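// runChecks runs every currently registered check concurrently and waits for
// all of them to finish.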
func (w *worker) runChecks(ctx context.Context) {
	w.checksLock.RLock()
	// Copy the [w.checks] map to collect the checks that we will be running
	// during this iteration. If [w.checks] is modified during this iteration of
	// [runChecks], then the added check will not be run until the next
	// iteration.
	checks := maps.Clone(w.checks)
	w.checksLock.RUnlock()

	var wg sync.WaitGroup
	wg.Add(len(checks))
	for name, check := range checks {
		go w.runCheck(ctx, &wg, name, check)
	}
	wg.Wait()
}

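// runCheck runs a single health check, records its result, and updates the
// failing-check metrics when the check transitions between passing and
// failing.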
func (w *worker) runCheck(ctx context.Context, wg *sync.WaitGroup, name string, check *taggedChecker) {
	defer wg.Done()

	start := time.Now()

	// To avoid any deadlocks when [RegisterCheck] is called with a lock
	// that is grabbed by [check.HealthCheck], we ensure that no locks
	// are held when [check.HealthCheck] is called.
	details, err := check.checker.HealthCheck(ctx)
	end := time.Now()

	result := Result{
		Details:   details,
		Timestamp: end,
		Duration:  end.Sub(start),
	}

	w.resultsLock.Lock()
	defer w.resultsLock.Unlock()
	prevResult := w.results[name]
	if err != nil {
		errString := err.Error()
		result.Error = &errString

		result.ContiguousFailures = prevResult.ContiguousFailures + 1
		if prevResult.ContiguousFailures > 0 {
			result.TimeOfFirstFailure = prevResult.TimeOfFirstFailure
		} else {
			result.TimeOfFirstFailure = &end
		}

		if prevResult.Error == nil {
			w.log.Warn("check started failing",
				zap.String("worker", w.name),
				zap.String("name", name),
				zap.Strings("tags", check.tags),
				zap.Error(err),
			)
			w.updateMetrics(check, false /*=healthy*/, false /*=register*/)
		}
	} else if prevResult.Error != nil {
		w.log.Info("check started passing",
			zap.String("worker", w.name),
			zap.String("name", name),
			zap.Strings("tags", check.tags),
		)
		w.updateMetrics(check, true /*=healthy*/, false /*=register*/)
	}
	w.results[name] = result
}

// updateMetrics updates the metrics for the given check. If [healthy] is true,
// then the check is considered healthy and the metrics are decremented.
// Otherwise, the check is considered unhealthy and the metrics are incremented.
// [register] must be true only if this is the first time the check is being
// registered.
func (w *worker) updateMetrics(tc *taggedChecker, healthy bool, register bool) {
	if tc.isApplicationCheck {
		// Note: [w.tags] will include AllTag.
		for tag := range w.tags {
			gauge := w.failingChecks.With(prometheus.Labels{
				CheckLabel: w.name,
				TagLabel:   tag,
			})
			if healthy {
				gauge.Dec()
			} else {
				gauge.Inc()
			}
		}
		if healthy {
			w.numFailingApplicationChecks--
		} else {
			w.numFailingApplicationChecks++
		}
	} else {
		for _, tag := range tc.tags {
			gauge := w.failingChecks.With(prometheus.Labels{
				CheckLabel: w.name,
				TagLabel:   tag,
			})
			if healthy {
				gauge.Dec()
			} else {
				gauge.Inc()
				// If this is the first time this tag was registered, we also need to
				// account for the currently failing application-wide checks.
				if register && w.tags[tag].Len() == 1 {
					gauge.Add(float64(w.numFailingApplicationChecks))
				}
			}
		}
		gauge := w.failingChecks.With(prometheus.Labels{
			CheckLabel: w.name,
			TagLabel:   AllTag,
		})
		if healthy {
			gauge.Dec()
		} else {
			gauge.Inc()
		}
	}
}