github.com/mdaxf/iac@v0.0.0-20240519030858-58a061660378/health/health.go (about)

     1  package health
     2  
     3  import (
     4  	"context"
     5  	"encoding/json"
     6  	"errors"
     7  	"fmt"
     8  	"net/http"
     9  	"runtime"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/mdaxf/iac/com"
    14  	"go.opentelemetry.io/otel/attribute"
    15  	"go.opentelemetry.io/otel/codes"
    16  	"go.opentelemetry.io/otel/trace"
    17  )
    18  
    19  // Status type represents health status
    20  type Status string
    21  
    22  // Possible health statuses
    23  const (
    24  	StatusOK                 Status = "OK"
    25  	StatusPartiallyAvailable Status = "Partially Available"
    26  	StatusUnavailable        Status = "Unavailable"
    27  	StatusTimeout            Status = "Timeout during health check"
    28  )
    29  
    30  type (
    31  	// CheckFunc is the func which executes the check.
    32  	CheckFunc func(context.Context) error
    33  
    34  	// Config carries the parameters to run the check.
    35  	Config struct {
    36  		// Name is the name of the resource to be checked.
    37  		Name string
    38  		// Timeout is the timeout defined for every check.
    39  		Timeout time.Duration
    40  		// SkipOnErr if set to true, it will retrieve StatusOK providing the error message from the failed resource.
    41  		SkipOnErr bool
    42  		// Check is the func which executes the check.
    43  		Check CheckFunc
    44  	}
    45  
    46  	// Check represents the health check response.
    47  	Check struct {
    48  		// Status is the check status.
    49  		Status Status `json:"status"`
    50  		// Result is the check result.
    51  		Results map[string]interface{} `json:"results"`
    52  		// Timestamp is the time in which the check occurred.
    53  		Timestamp time.Time `json:"timestamp"`
    54  		// Failures holds the failed checks along with their messages.
    55  		Failures map[string]string `json:"failures,omitempty"`
    56  		// System holds information of the go process.
    57  		*System `json:"system,omitempty"`
    58  		// Component holds information on the component for which checks are made
    59  		Component `json:"component"`
    60  	}
    61  
    62  	// System runtime variables about the go process.
    63  	System struct {
    64  		// Version is the go version.
    65  		Version string `json:"version"`
    66  		// GoroutinesCount is the number of the current goroutines.
    67  		GoroutinesCount int `json:"goroutines_count"`
    68  		// TotalAllocBytes is the total bytes allocated.
    69  		TotalAllocBytes int `json:"total_alloc_bytes"`
    70  		// HeapObjectsCount is the number of objects in the go heap.
    71  		HeapObjectsCount int `json:"heap_objects_count"`
    72  		// TotalAllocBytes is the bytes allocated and not yet freed.
    73  		AllocBytes int `json:"alloc_bytes"`
    74  
    75  		Metrics map[string]interface{} `json:"system"`
    76  	}
    77  
    78  	// Component descriptive values about the component for which checks are made
    79  	Component struct {
    80  		// Name is the name of the component.
    81  		Name         string `json:"name"`
    82  		Instance     string `json:"instance"`
    83  		InstanceName string `json:"instancename"`
    84  		InstanceType string `json:"instancetype"`
    85  		// Version is the component version.
    86  		Version string `json:"version"`
    87  	}
    88  
    89  	// Health is the health-checks container
    90  	Health struct {
    91  		mu            sync.Mutex
    92  		checks        map[string]Config
    93  		maxConcurrent int
    94  
    95  		tp                  trace.TracerProvider
    96  		instrumentationName string
    97  
    98  		component Component
    99  
   100  		systemInfoEnabled bool
   101  	}
   102  )
   103  
   104  // New instantiates and build new health check container
   105  func New(opts ...Option) (*Health, error) {
   106  	h := &Health{
   107  		checks:        make(map[string]Config),
   108  		tp:            trace.NewNoopTracerProvider(),
   109  		maxConcurrent: runtime.NumCPU(),
   110  	}
   111  
   112  	for _, o := range opts {
   113  		if err := o(h); err != nil {
   114  			return nil, err
   115  		}
   116  	}
   117  
   118  	return h, nil
   119  }
   120  
   121  // Register registers a check config to be performed.
   122  func (h *Health) Register(c Config) error {
   123  	if c.Timeout == 0 {
   124  		c.Timeout = time.Second * 2
   125  	}
   126  
   127  	if c.Name == "" {
   128  		return errors.New("health check must have a name to be registered")
   129  	}
   130  
   131  	h.mu.Lock()
   132  	defer h.mu.Unlock()
   133  
   134  	if _, ok := h.checks[c.Name]; ok {
   135  		return fmt.Errorf("health check %q is already registered", c.Name)
   136  	}
   137  
   138  	h.checks[c.Name] = c
   139  
   140  	return nil
   141  }
   142  
   143  // Handler returns an HTTP handler (http.HandlerFunc).
   144  func (h *Health) Handler() http.Handler {
   145  
   146  	return http.HandlerFunc(h.HandlerFunc)
   147  }
   148  
   149  // HandlerFunc is the HTTP handler function.
   150  func (h *Health) HandlerFunc(w http.ResponseWriter, r *http.Request) {
   151  	c := h.Measure(r.Context())
   152  
   153  	w.Header().Set("Content-Type", "application/json")
   154  	data, err := json.Marshal(c)
   155  	if err != nil {
   156  		w.WriteHeader(http.StatusInternalServerError)
   157  		http.Error(w, err.Error(), http.StatusInternalServerError)
   158  		return
   159  	}
   160  
   161  	code := http.StatusOK
   162  	/*if c.Status == StatusUnavailable {
   163  		code = http.StatusServiceUnavailable
   164  	}  */
   165  	w.WriteHeader(code)
   166  	w.Write(data)
   167  }
   168  
   169  // Measure runs all the registered health checks and returns summary status
   170  func (h *Health) Measure(ctx context.Context) Check {
   171  	h.mu.Lock()
   172  	defer h.mu.Unlock()
   173  
   174  	tracer := h.tp.Tracer(h.instrumentationName)
   175  
   176  	ctx, span := tracer.Start(
   177  		ctx,
   178  		"health.Measure",
   179  		trace.WithAttributes(attribute.Int("checks", len(h.checks))),
   180  	)
   181  	defer span.End()
   182  
   183  	status := StatusOK
   184  	failures := make(map[string]string)
   185  
   186  	limiterCh := make(chan bool, h.maxConcurrent)
   187  	defer close(limiterCh)
   188  
   189  	var (
   190  		wg sync.WaitGroup
   191  		mu sync.Mutex
   192  	)
   193  
   194  	fmt.Println("h.checks", h.checks)
   195  	results := make(map[string]interface{})
   196  
   197  	for _, c := range h.checks {
   198  		limiterCh <- true
   199  		wg.Add(1)
   200  		result := make(map[string]interface{})
   201  		go func(c Config) {
   202  			fmt.Println("configuration:", c)
   203  			result["Name"] = c.Name
   204  			result["Timeout"] = c.Timeout
   205  			result["SkipOnErr"] = c.SkipOnErr
   206  
   207  			ctx, span := tracer.Start(ctx, c.Name)
   208  			defer func() {
   209  				span.End()
   210  				<-limiterCh
   211  				wg.Done()
   212  			}()
   213  
   214  			resCh := make(chan error)
   215  
   216  			go func() {
   217  				resCh <- c.Check(ctx)
   218  				defer close(resCh)
   219  			}()
   220  
   221  			timeout := time.NewTimer(c.Timeout)
   222  			fmt.Println("timeout:", timeout)
   223  			fmt.Println("resCh:", resCh)
   224  			select {
   225  			case <-timeout.C:
   226  				mu.Lock()
   227  				defer mu.Unlock()
   228  
   229  				span.SetStatus(codes.Error, string(StatusTimeout))
   230  
   231  				failures[c.Name] = string(StatusTimeout)
   232  				status = getAvailability(status, c.SkipOnErr)
   233  			case res := <-resCh:
   234  				if !timeout.Stop() {
   235  					<-timeout.C
   236  				}
   237  
   238  				mu.Lock()
   239  				defer mu.Unlock()
   240  
   241  				if res != nil {
   242  					span.RecordError(res)
   243  
   244  					failures[c.Name] = res.Error()
   245  					status = getAvailability(status, c.SkipOnErr)
   246  				}
   247  			}
   248  			result["status"] = status
   249  			results[c.Name] = result
   250  		}(c)
   251  	}
   252  
   253  	wg.Wait()
   254  	span.SetAttributes(attribute.String("status", string(status)))
   255  
   256  	resultstr, err := com.ConvertMapToString(results)
   257  	if err != nil {
   258  		fmt.Println(fmt.Sprintf("Failed to convert json to map: %v", err))
   259  		resultstr = fmt.Sprintf("%v", results)
   260  	}
   261  	span.SetAttributes(attribute.String("results", resultstr))
   262  
   263  	var systemMetrics *System
   264  	if h.systemInfoEnabled {
   265  		systemMetrics = newSystemMetrics()
   266  	}
   267  
   268  	return newCheck(h.component, status, results, systemMetrics, failures)
   269  }
   270  
   271  func newCheck(c Component, s Status, results map[string]interface{}, system *System, failures map[string]string) Check {
   272  	return Check{
   273  		Status:    s,
   274  		Results:   results,
   275  		Timestamp: time.Now(),
   276  		Failures:  failures,
   277  		System:    system,
   278  		Component: c,
   279  	}
   280  }
   281  
   282  func newSystemMetrics() *System {
   283  	s := runtime.MemStats{}
   284  	runtime.ReadMemStats(&s)
   285  
   286  	metrics := com.ConvertstructToMap(s)
   287  
   288  	return &System{
   289  		Version:          runtime.Version(),
   290  		GoroutinesCount:  runtime.NumGoroutine(),
   291  		TotalAllocBytes:  int(s.TotalAlloc),
   292  		HeapObjectsCount: int(s.HeapObjects),
   293  		AllocBytes:       int(s.Alloc),
   294  		Metrics:          metrics,
   295  	}
   296  }
   297  
   298  func getAvailability(s Status, skipOnErr bool) Status {
   299  	if skipOnErr && s != StatusUnavailable {
   300  		return StatusPartiallyAvailable
   301  	}
   302  
   303  	return StatusUnavailable
   304  }