github.com/Financial-Times/publish-availability-monitor@v1.12.0/healthcheck.go (about) 1 package main 2 3 import ( 4 "errors" 5 "fmt" 6 "net/http" 7 "net/url" 8 "strings" 9 "sync" 10 "time" 11 12 fthealth "github.com/Financial-Times/go-fthealth/v1_1" 13 "github.com/Financial-Times/go-logger/v2" 14 "github.com/Financial-Times/kafka-client-go/v4" 15 "github.com/Financial-Times/publish-availability-monitor/checks" 16 "github.com/Financial-Times/publish-availability-monitor/config" 17 "github.com/Financial-Times/publish-availability-monitor/envs" 18 "github.com/Financial-Times/publish-availability-monitor/feeds" 19 "github.com/Financial-Times/publish-availability-monitor/metrics" 20 "github.com/Financial-Times/service-status-go/gtg" 21 ) 22 23 const requestTimeout = 4500 24 25 // Healthcheck offers methods to measure application health. 26 type Healthcheck struct { 27 client *http.Client 28 config *config.AppConfig 29 consumer kafkaConsumer 30 metricContainer *metrics.History 31 environments *envs.Environments 32 subscribedFeeds map[string][]feeds.Feed 33 log *logger.UPPLogger 34 } 35 36 type kafkaConsumer interface { 37 ConnectivityCheck() error 38 MonitorCheck() error 39 } 40 41 func newHealthcheck(config *config.AppConfig, metricContainer *metrics.History, environments *envs.Environments, subscribedFeeds map[string][]feeds.Feed, c kafkaConsumer, log *logger.UPPLogger) *Healthcheck { 42 httpClient := &http.Client{Timeout: requestTimeout * time.Millisecond} 43 return &Healthcheck{ 44 client: httpClient, 45 config: config, 46 consumer: c, 47 metricContainer: metricContainer, 48 environments: environments, 49 subscribedFeeds: subscribedFeeds, 50 log: log, 51 } 52 } 53 54 type readEnvironmentHealthcheck struct { 55 env envs.Environment 56 client *http.Client 57 appConfig *config.AppConfig 58 log *logger.UPPLogger 59 } 60 61 const pamRunbookURL = "https://runbooks.in.ft.com/publish-availability-monitor" 62 63 var noReadEnvironments = fthealth.Check{ 64 ID: "ReadEnvironments", 65 BusinessImpact: "Publish metrics are not recorded. This will impact the SLA measurement.", 66 Name: "ReadEnvironments", 67 PanicGuide: pamRunbookURL, 68 Severity: 1, 69 TechnicalSummary: "There are no read environments to monitor. This could be because none have been configured", 70 Checker: func() (string, error) { 71 return "", errors.New("there are no read environments to monitor") 72 }, 73 } 74 75 func (h *Healthcheck) checkHealth() func(w http.ResponseWriter, r *http.Request) { 76 checks := []fthealth.Check{ 77 h.consumerQueueReachable(), 78 h.reflectPublishFailures(), 79 h.validationServicesReachable(), 80 h.isConsumingFromPushFeeds(), 81 h.consumerMonitorCheck(), 82 } 83 84 readEnvironmentChecks := h.readEnvironmentsReachable() 85 if len(readEnvironmentChecks) == 0 { 86 checks = append(checks, noReadEnvironments) 87 } else { 88 checks = append(checks, readEnvironmentChecks...) 89 } 90 91 hc := fthealth.TimedHealthCheck{ 92 HealthCheck: fthealth.HealthCheck{ 93 SystemCode: "publish-availability-monitor", 94 Name: "Publish Availability Monitor", 95 Description: "Monitors publishes to the UPP platform and alerts on any publishing failures", 96 Checks: checks, 97 }, 98 Timeout: 10 * time.Second, 99 } 100 101 return fthealth.Handler(hc) 102 } 103 104 func (h *Healthcheck) GTG() gtg.Status { 105 consumerCheck := func() gtg.Status { 106 return gtgCheck(h.checkConsumerConnectivity) 107 } 108 109 validationServiceCheck := func() gtg.Status { 110 return gtgCheck(h.checkValidationServicesReachable) 111 } 112 113 return gtg.FailFastParallelCheck([]gtg.StatusChecker{ 114 consumerCheck, 115 validationServiceCheck, 116 })() 117 } 118 119 func gtgCheck(handler func() (string, error)) gtg.Status { 120 if _, err := handler(); err != nil { 121 return gtg.Status{GoodToGo: false, Message: err.Error()} 122 } 123 return gtg.Status{GoodToGo: true} 124 } 125 126 func (h *Healthcheck) isConsumingFromPushFeeds() fthealth.Check { 127 return fthealth.Check{ 128 ID: "IsConsumingFromNotificationsPushFeeds", 129 BusinessImpact: "Publish metrics are not recorded. This will impact the SLA measurement.", 130 Name: "IsConsumingFromNotificationsPushFeeds", 131 PanicGuide: pamRunbookURL, 132 Severity: 1, 133 TechnicalSummary: "The connections to the configured notifications-push feeds are operating correctly.", 134 Checker: func() (string, error) { 135 var failing []string 136 result := true 137 for _, val := range h.subscribedFeeds { 138 for _, feed := range val { 139 push, ok := feed.(*feeds.NotificationsPushFeed) 140 if ok && !push.IsConnected() { 141 h.log.Warnf("Feed \"%s\" with URL \"%s\" is not connected!", feed.FeedName(), feed.FeedURL()) 142 failing = append(failing, feed.FeedURL()) 143 result = false 144 } 145 } 146 } 147 148 if !result { 149 return "Disconnection detected.", errors.New("At least one of our Notifications Push feeds in the delivery cluster is disconnected! " + 150 "Please review the logs, and check delivery healthchecks. " + 151 "We will attempt reconnection indefinitely, but there could be an issue with the delivery cluster's notifications-push services. " + 152 "Failing connections: " + strings.Join(failing, ",")) 153 } 154 return "", nil 155 }, 156 } 157 } 158 159 func (h *Healthcheck) consumerQueueReachable() fthealth.Check { 160 return fthealth.Check{ 161 ID: "ConsumerQueueReachable", 162 BusinessImpact: "Publish metrics are not recorded. This will impact the SLA measurement.", 163 Name: "ConsumerQueueReachable", 164 PanicGuide: pamRunbookURL, 165 Severity: 1, 166 TechnicalSummary: "Kafka consumer is not reachable/healthy", 167 Checker: h.checkConsumerConnectivity, 168 } 169 } 170 171 func (h *Healthcheck) consumerMonitorCheck() fthealth.Check { 172 return fthealth.Check{ 173 ID: "ConsumerQueueLagging", 174 BusinessImpact: "Publish metrics are slowed down. This will impact the SLA measurement.", 175 Name: "ConsumerQueueLagging", 176 PanicGuide: pamRunbookURL, 177 Severity: 2, 178 TechnicalSummary: kafka.LagTechnicalSummary, 179 Checker: h.checkConsumerMonitor, 180 } 181 } 182 183 func (h *Healthcheck) reflectPublishFailures() fthealth.Check { 184 return fthealth.Check{ 185 ID: "ReflectPublishFailures", 186 BusinessImpact: "At least two of the last 10 publishes failed. This will reflect in the SLA measurement.", 187 Name: "ReflectPublishFailures", 188 PanicGuide: pamRunbookURL, 189 Severity: 1, 190 TechnicalSummary: "Publishes did not meet the SLA measurments", 191 Checker: h.checkForPublishFailures, 192 } 193 } 194 195 func (h *Healthcheck) checkForPublishFailures() (string, error) { 196 failures := h.metricContainer.GetFailures() 197 198 failureThreshold := 2 //default 199 if h.config.HealthConf.FailureThreshold != 0 { 200 failureThreshold = h.config.HealthConf.FailureThreshold 201 } 202 203 if len(failures) >= failureThreshold { 204 return "", fmt.Errorf("%d publish failures happened during the last 10 publishes", len(failures)) 205 } 206 return "", nil 207 } 208 209 func (h *Healthcheck) validationServicesReachable() fthealth.Check { 210 return fthealth.Check{ 211 ID: "validationServicesReachable", 212 BusinessImpact: "Publish metrics might not be correct. False positive failures might be recorded. This will impact the SLA measurement.", 213 Name: "validationServicesReachable", 214 PanicGuide: pamRunbookURL, 215 Severity: 1, 216 TechnicalSummary: "Validation services are not reachable/healthy", 217 Checker: h.checkValidationServicesReachable, 218 } 219 } 220 221 func (h *Healthcheck) checkValidationServicesReachable() (string, error) { 222 endpoints := h.config.ValidationEndpoints 223 var wg sync.WaitGroup 224 hcErrs := make(chan error, len(endpoints)) 225 for _, url := range endpoints { 226 wg.Add(1) 227 healthcheckURL, err := inferHealthCheckURL(url) 228 if err != nil { 229 h.log.WithError(err).Errorf("Validation Service URL: [%s].", url) 230 continue 231 } 232 username, password := envs.GetValidationCredentials() 233 go checkServiceReachable(healthcheckURL, username, password, h.client, hcErrs, &wg, h.log) 234 } 235 236 wg.Wait() 237 close(hcErrs) 238 for err := range hcErrs { 239 if err != nil { 240 return "", err 241 } 242 } 243 return "", nil 244 } 245 246 func (h *Healthcheck) checkConsumerConnectivity() (string, error) { 247 if err := h.consumer.ConnectivityCheck(); err != nil { 248 return "", err 249 } 250 return "OK", nil 251 } 252 253 func (h *Healthcheck) checkConsumerMonitor() (string, error) { 254 if err := h.consumer.MonitorCheck(); err != nil { 255 return "", err 256 } 257 return "OK", nil 258 } 259 260 func checkServiceReachable(healthcheckURL string, username string, password string, client *http.Client, hcRes chan<- error, wg *sync.WaitGroup, log *logger.UPPLogger) { 261 defer wg.Done() 262 log.Debugf("Checking: %s", healthcheckURL) 263 264 req, err := http.NewRequest("GET", healthcheckURL, nil) 265 if err != nil { 266 hcRes <- fmt.Errorf("cannot create HTTP request with URL: [%s]: [%w]", healthcheckURL, err) 267 return 268 } 269 270 if username != "" && password != "" { 271 req.SetBasicAuth(username, password) 272 } 273 274 resp, err := client.Do(req) 275 if err != nil { 276 hcRes <- fmt.Errorf("healthcheck URL: [%s]: [%w]", healthcheckURL, err) 277 return 278 } 279 defer resp.Body.Close() 280 if resp.StatusCode != 200 { 281 hcRes <- fmt.Errorf("unhealthy statusCode received: [%d] for URL [%s]", resp.StatusCode, healthcheckURL) 282 return 283 } 284 hcRes <- nil 285 } 286 287 func (h *Healthcheck) readEnvironmentsReachable() []fthealth.Check { 288 for i := 0; !h.environments.AreReady() && i < 5; i++ { 289 h.log.Info("Environments not set, retry in 2s...") 290 time.Sleep(2 * time.Second) 291 } 292 293 hc := make([]fthealth.Check, 0, h.environments.Len()) 294 295 for _, envName := range h.environments.Names() { 296 hc = append(hc, fthealth.Check{ 297 ID: envName + "-readEndpointsReachable", 298 BusinessImpact: "Publish metrics might not be correct. False positive failures might be recorded. This will impact the SLA measurement.", 299 Name: envName + "-readEndpointsReachable", 300 PanicGuide: pamRunbookURL, 301 Severity: 1, 302 TechnicalSummary: "Read services are not reachable/healthy", 303 Checker: (&readEnvironmentHealthcheck{ 304 env: h.environments.Environment(envName), 305 client: h.client, 306 appConfig: h.config, 307 log: h.log, 308 }).checkReadEnvironmentReachable, 309 }) 310 } 311 return hc 312 } 313 314 func (h *readEnvironmentHealthcheck) checkReadEnvironmentReachable() (string, error) { 315 var wg sync.WaitGroup 316 hcErrs := make(chan error, len(h.appConfig.MetricConf)) 317 318 for _, metric := range h.appConfig.MetricConf { 319 var endpointURL *url.URL 320 var err error 321 var username, password string 322 if checks.AbsoluteURLRegex.MatchString(metric.Endpoint) { 323 endpointURL, err = url.Parse(metric.Endpoint) 324 } else { 325 endpointURL, err = url.Parse(h.env.ReadURL + metric.Endpoint) 326 username = h.env.Username 327 password = h.env.Password 328 } 329 330 if err != nil { 331 h.log.WithError(err).Errorf("Cannot parse url [%v]", metric.Endpoint) 332 continue 333 } 334 335 healthcheckURL := buildFtHealthcheckURL(*endpointURL, metric.Health) 336 337 wg.Add(1) 338 go checkServiceReachable(healthcheckURL, username, password, h.client, hcErrs, &wg, h.log) 339 } 340 341 wg.Wait() 342 close(hcErrs) 343 for err := range hcErrs { 344 if err != nil { 345 return "", err 346 } 347 } 348 return "", nil 349 } 350 351 func inferHealthCheckURL(serviceURL string) (string, error) { 352 parsedURL, err := url.Parse(serviceURL) 353 if err != nil { 354 return "", err 355 } 356 357 var newPath string 358 if strings.HasPrefix(parsedURL.Path, "/__") { 359 newPath = strings.SplitN(parsedURL.Path[1:], "/", 2)[0] + "/__health" 360 } else { 361 newPath = "/__health" 362 } 363 364 parsedURL.Path = newPath 365 return parsedURL.String(), nil 366 } 367 368 func buildFtHealthcheckURL(endpoint url.URL, health string) string { 369 endpoint.Path = health 370 endpoint.RawQuery = "" // strip query params 371 return endpoint.String() 372 }