github.com/mdaxf/iac@v0.0.0-20240519030858-58a061660378/health/health.go (about) 1 package health 2 3 import ( 4 "context" 5 "encoding/json" 6 "errors" 7 "fmt" 8 "net/http" 9 "runtime" 10 "sync" 11 "time" 12 13 "github.com/mdaxf/iac/com" 14 "go.opentelemetry.io/otel/attribute" 15 "go.opentelemetry.io/otel/codes" 16 "go.opentelemetry.io/otel/trace" 17 ) 18 19 // Status type represents health status 20 type Status string 21 22 // Possible health statuses 23 const ( 24 StatusOK Status = "OK" 25 StatusPartiallyAvailable Status = "Partially Available" 26 StatusUnavailable Status = "Unavailable" 27 StatusTimeout Status = "Timeout during health check" 28 ) 29 30 type ( 31 // CheckFunc is the func which executes the check. 32 CheckFunc func(context.Context) error 33 34 // Config carries the parameters to run the check. 35 Config struct { 36 // Name is the name of the resource to be checked. 37 Name string 38 // Timeout is the timeout defined for every check. 39 Timeout time.Duration 40 // SkipOnErr if set to true, it will retrieve StatusOK providing the error message from the failed resource. 41 SkipOnErr bool 42 // Check is the func which executes the check. 43 Check CheckFunc 44 } 45 46 // Check represents the health check response. 47 Check struct { 48 // Status is the check status. 49 Status Status `json:"status"` 50 // Result is the check result. 51 Results map[string]interface{} `json:"results"` 52 // Timestamp is the time in which the check occurred. 53 Timestamp time.Time `json:"timestamp"` 54 // Failures holds the failed checks along with their messages. 55 Failures map[string]string `json:"failures,omitempty"` 56 // System holds information of the go process. 57 *System `json:"system,omitempty"` 58 // Component holds information on the component for which checks are made 59 Component `json:"component"` 60 } 61 62 // System runtime variables about the go process. 63 System struct { 64 // Version is the go version. 65 Version string `json:"version"` 66 // GoroutinesCount is the number of the current goroutines. 67 GoroutinesCount int `json:"goroutines_count"` 68 // TotalAllocBytes is the total bytes allocated. 69 TotalAllocBytes int `json:"total_alloc_bytes"` 70 // HeapObjectsCount is the number of objects in the go heap. 71 HeapObjectsCount int `json:"heap_objects_count"` 72 // TotalAllocBytes is the bytes allocated and not yet freed. 73 AllocBytes int `json:"alloc_bytes"` 74 75 Metrics map[string]interface{} `json:"system"` 76 } 77 78 // Component descriptive values about the component for which checks are made 79 Component struct { 80 // Name is the name of the component. 81 Name string `json:"name"` 82 Instance string `json:"instance"` 83 InstanceName string `json:"instancename"` 84 InstanceType string `json:"instancetype"` 85 // Version is the component version. 86 Version string `json:"version"` 87 } 88 89 // Health is the health-checks container 90 Health struct { 91 mu sync.Mutex 92 checks map[string]Config 93 maxConcurrent int 94 95 tp trace.TracerProvider 96 instrumentationName string 97 98 component Component 99 100 systemInfoEnabled bool 101 } 102 ) 103 104 // New instantiates and build new health check container 105 func New(opts ...Option) (*Health, error) { 106 h := &Health{ 107 checks: make(map[string]Config), 108 tp: trace.NewNoopTracerProvider(), 109 maxConcurrent: runtime.NumCPU(), 110 } 111 112 for _, o := range opts { 113 if err := o(h); err != nil { 114 return nil, err 115 } 116 } 117 118 return h, nil 119 } 120 121 // Register registers a check config to be performed. 122 func (h *Health) Register(c Config) error { 123 if c.Timeout == 0 { 124 c.Timeout = time.Second * 2 125 } 126 127 if c.Name == "" { 128 return errors.New("health check must have a name to be registered") 129 } 130 131 h.mu.Lock() 132 defer h.mu.Unlock() 133 134 if _, ok := h.checks[c.Name]; ok { 135 return fmt.Errorf("health check %q is already registered", c.Name) 136 } 137 138 h.checks[c.Name] = c 139 140 return nil 141 } 142 143 // Handler returns an HTTP handler (http.HandlerFunc). 144 func (h *Health) Handler() http.Handler { 145 146 return http.HandlerFunc(h.HandlerFunc) 147 } 148 149 // HandlerFunc is the HTTP handler function. 150 func (h *Health) HandlerFunc(w http.ResponseWriter, r *http.Request) { 151 c := h.Measure(r.Context()) 152 153 w.Header().Set("Content-Type", "application/json") 154 data, err := json.Marshal(c) 155 if err != nil { 156 w.WriteHeader(http.StatusInternalServerError) 157 http.Error(w, err.Error(), http.StatusInternalServerError) 158 return 159 } 160 161 code := http.StatusOK 162 /*if c.Status == StatusUnavailable { 163 code = http.StatusServiceUnavailable 164 } */ 165 w.WriteHeader(code) 166 w.Write(data) 167 } 168 169 // Measure runs all the registered health checks and returns summary status 170 func (h *Health) Measure(ctx context.Context) Check { 171 h.mu.Lock() 172 defer h.mu.Unlock() 173 174 tracer := h.tp.Tracer(h.instrumentationName) 175 176 ctx, span := tracer.Start( 177 ctx, 178 "health.Measure", 179 trace.WithAttributes(attribute.Int("checks", len(h.checks))), 180 ) 181 defer span.End() 182 183 status := StatusOK 184 failures := make(map[string]string) 185 186 limiterCh := make(chan bool, h.maxConcurrent) 187 defer close(limiterCh) 188 189 var ( 190 wg sync.WaitGroup 191 mu sync.Mutex 192 ) 193 194 fmt.Println("h.checks", h.checks) 195 results := make(map[string]interface{}) 196 197 for _, c := range h.checks { 198 limiterCh <- true 199 wg.Add(1) 200 result := make(map[string]interface{}) 201 go func(c Config) { 202 fmt.Println("configuration:", c) 203 result["Name"] = c.Name 204 result["Timeout"] = c.Timeout 205 result["SkipOnErr"] = c.SkipOnErr 206 207 ctx, span := tracer.Start(ctx, c.Name) 208 defer func() { 209 span.End() 210 <-limiterCh 211 wg.Done() 212 }() 213 214 resCh := make(chan error) 215 216 go func() { 217 resCh <- c.Check(ctx) 218 defer close(resCh) 219 }() 220 221 timeout := time.NewTimer(c.Timeout) 222 fmt.Println("timeout:", timeout) 223 fmt.Println("resCh:", resCh) 224 select { 225 case <-timeout.C: 226 mu.Lock() 227 defer mu.Unlock() 228 229 span.SetStatus(codes.Error, string(StatusTimeout)) 230 231 failures[c.Name] = string(StatusTimeout) 232 status = getAvailability(status, c.SkipOnErr) 233 case res := <-resCh: 234 if !timeout.Stop() { 235 <-timeout.C 236 } 237 238 mu.Lock() 239 defer mu.Unlock() 240 241 if res != nil { 242 span.RecordError(res) 243 244 failures[c.Name] = res.Error() 245 status = getAvailability(status, c.SkipOnErr) 246 } 247 } 248 result["status"] = status 249 results[c.Name] = result 250 }(c) 251 } 252 253 wg.Wait() 254 span.SetAttributes(attribute.String("status", string(status))) 255 256 resultstr, err := com.ConvertMapToString(results) 257 if err != nil { 258 fmt.Println(fmt.Sprintf("Failed to convert json to map: %v", err)) 259 resultstr = fmt.Sprintf("%v", results) 260 } 261 span.SetAttributes(attribute.String("results", resultstr)) 262 263 var systemMetrics *System 264 if h.systemInfoEnabled { 265 systemMetrics = newSystemMetrics() 266 } 267 268 return newCheck(h.component, status, results, systemMetrics, failures) 269 } 270 271 func newCheck(c Component, s Status, results map[string]interface{}, system *System, failures map[string]string) Check { 272 return Check{ 273 Status: s, 274 Results: results, 275 Timestamp: time.Now(), 276 Failures: failures, 277 System: system, 278 Component: c, 279 } 280 } 281 282 func newSystemMetrics() *System { 283 s := runtime.MemStats{} 284 runtime.ReadMemStats(&s) 285 286 metrics := com.ConvertstructToMap(s) 287 288 return &System{ 289 Version: runtime.Version(), 290 GoroutinesCount: runtime.NumGoroutine(), 291 TotalAllocBytes: int(s.TotalAlloc), 292 HeapObjectsCount: int(s.HeapObjects), 293 AllocBytes: int(s.Alloc), 294 Metrics: metrics, 295 } 296 } 297 298 func getAvailability(s Status, skipOnErr bool) Status { 299 if skipOnErr && s != StatusUnavailable { 300 return StatusPartiallyAvailable 301 } 302 303 return StatusUnavailable 304 }