github.com/Financial-Times/publish-availability-monitor@v1.12.0/healthcheck.go (about)

     1  package main
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"net/http"
     7  	"net/url"
     8  	"strings"
     9  	"sync"
    10  	"time"
    11  
    12  	fthealth "github.com/Financial-Times/go-fthealth/v1_1"
    13  	"github.com/Financial-Times/go-logger/v2"
    14  	"github.com/Financial-Times/kafka-client-go/v4"
    15  	"github.com/Financial-Times/publish-availability-monitor/checks"
    16  	"github.com/Financial-Times/publish-availability-monitor/config"
    17  	"github.com/Financial-Times/publish-availability-monitor/envs"
    18  	"github.com/Financial-Times/publish-availability-monitor/feeds"
    19  	"github.com/Financial-Times/publish-availability-monitor/metrics"
    20  	"github.com/Financial-Times/service-status-go/gtg"
    21  )
    22  
// requestTimeout is the timeout, in milliseconds, of the HTTP client that
// newHealthcheck creates (it is multiplied by time.Millisecond there).
const requestTimeout = 4500
    24  
// Healthcheck offers methods to measure application health.
// It bundles the dependencies that the individual checks in this file read.
type Healthcheck struct {
	client          *http.Client            // HTTP client handed to checkServiceReachable for outbound requests
	config          *config.AppConfig       // application configuration (failure threshold, validation endpoints)
	consumer        kafkaConsumer           // consumer whose ConnectivityCheck and MonitorCheck back the Kafka checks
	metricContainer *metrics.History        // source of the publish failures examined by checkForPublishFailures
	environments    *envs.Environments      // read environments for which reachability checks are built
	subscribedFeeds map[string][]feeds.Feed // feeds inspected for disconnected notifications-push connections; key meaning not visible here
	log             *logger.UPPLogger       // logger used by the checks
}
    35  
// kafkaConsumer is the subset of the Kafka consumer that the healthchecks
// depend on. Both methods return a non-nil error when the check fails.
type kafkaConsumer interface {
	// ConnectivityCheck backs the "ConsumerQueueReachable" check and the GTG endpoint.
	ConnectivityCheck() error
	// MonitorCheck backs the "ConsumerQueueLagging" check.
	MonitorCheck() error
}
    40  
    41  func newHealthcheck(config *config.AppConfig, metricContainer *metrics.History, environments *envs.Environments, subscribedFeeds map[string][]feeds.Feed, c kafkaConsumer, log *logger.UPPLogger) *Healthcheck {
    42  	httpClient := &http.Client{Timeout: requestTimeout * time.Millisecond}
    43  	return &Healthcheck{
    44  		client:          httpClient,
    45  		config:          config,
    46  		consumer:        c,
    47  		metricContainer: metricContainer,
    48  		environments:    environments,
    49  		subscribedFeeds: subscribedFeeds,
    50  		log:             log,
    51  	}
    52  }
    53  
// readEnvironmentHealthcheck carries what is needed to check the read
// endpoints of a single environment; see checkReadEnvironmentReachable.
type readEnvironmentHealthcheck struct {
	env       envs.Environment  // environment whose ReadURL, Username and Password are used for relative endpoints
	client    *http.Client      // HTTP client used for the reachability requests
	appConfig *config.AppConfig // configuration providing the MetricConf entries to probe
	log       *logger.UPPLogger // logger for URL parse failures and request tracing
}
    60  
// pamRunbookURL is the runbook link used as the PanicGuide of the checks
// defined in this file.
const pamRunbookURL = "https://runbooks.in.ft.com/publish-availability-monitor"
    62  
    63  var noReadEnvironments = fthealth.Check{
    64  	ID:               "ReadEnvironments",
    65  	BusinessImpact:   "Publish metrics are not recorded. This will impact the SLA measurement.",
    66  	Name:             "ReadEnvironments",
    67  	PanicGuide:       pamRunbookURL,
    68  	Severity:         1,
    69  	TechnicalSummary: "There are no read environments to monitor. This could be because none have been configured",
    70  	Checker: func() (string, error) {
    71  		return "", errors.New("there are no read environments to monitor")
    72  	},
    73  }
    74  
    75  func (h *Healthcheck) checkHealth() func(w http.ResponseWriter, r *http.Request) {
    76  	checks := []fthealth.Check{
    77  		h.consumerQueueReachable(),
    78  		h.reflectPublishFailures(),
    79  		h.validationServicesReachable(),
    80  		h.isConsumingFromPushFeeds(),
    81  		h.consumerMonitorCheck(),
    82  	}
    83  
    84  	readEnvironmentChecks := h.readEnvironmentsReachable()
    85  	if len(readEnvironmentChecks) == 0 {
    86  		checks = append(checks, noReadEnvironments)
    87  	} else {
    88  		checks = append(checks, readEnvironmentChecks...)
    89  	}
    90  
    91  	hc := fthealth.TimedHealthCheck{
    92  		HealthCheck: fthealth.HealthCheck{
    93  			SystemCode:  "publish-availability-monitor",
    94  			Name:        "Publish Availability Monitor",
    95  			Description: "Monitors publishes to the UPP platform and alerts on any publishing failures",
    96  			Checks:      checks,
    97  		},
    98  		Timeout: 10 * time.Second,
    99  	}
   100  
   101  	return fthealth.Handler(hc)
   102  }
   103  
   104  func (h *Healthcheck) GTG() gtg.Status {
   105  	consumerCheck := func() gtg.Status {
   106  		return gtgCheck(h.checkConsumerConnectivity)
   107  	}
   108  
   109  	validationServiceCheck := func() gtg.Status {
   110  		return gtgCheck(h.checkValidationServicesReachable)
   111  	}
   112  
   113  	return gtg.FailFastParallelCheck([]gtg.StatusChecker{
   114  		consumerCheck,
   115  		validationServiceCheck,
   116  	})()
   117  }
   118  
   119  func gtgCheck(handler func() (string, error)) gtg.Status {
   120  	if _, err := handler(); err != nil {
   121  		return gtg.Status{GoodToGo: false, Message: err.Error()}
   122  	}
   123  	return gtg.Status{GoodToGo: true}
   124  }
   125  
   126  func (h *Healthcheck) isConsumingFromPushFeeds() fthealth.Check {
   127  	return fthealth.Check{
   128  		ID:               "IsConsumingFromNotificationsPushFeeds",
   129  		BusinessImpact:   "Publish metrics are not recorded. This will impact the SLA measurement.",
   130  		Name:             "IsConsumingFromNotificationsPushFeeds",
   131  		PanicGuide:       pamRunbookURL,
   132  		Severity:         1,
   133  		TechnicalSummary: "The connections to the configured notifications-push feeds are operating correctly.",
   134  		Checker: func() (string, error) {
   135  			var failing []string
   136  			result := true
   137  			for _, val := range h.subscribedFeeds {
   138  				for _, feed := range val {
   139  					push, ok := feed.(*feeds.NotificationsPushFeed)
   140  					if ok && !push.IsConnected() {
   141  						h.log.Warnf("Feed \"%s\" with URL \"%s\" is not connected!", feed.FeedName(), feed.FeedURL())
   142  						failing = append(failing, feed.FeedURL())
   143  						result = false
   144  					}
   145  				}
   146  			}
   147  
   148  			if !result {
   149  				return "Disconnection detected.", errors.New("At least one of our Notifications Push feeds in the delivery cluster is disconnected! " +
   150  					"Please review the logs, and check delivery healthchecks. " +
   151  					"We will attempt reconnection indefinitely, but there could be an issue with the delivery cluster's notifications-push services. " +
   152  					"Failing connections: " + strings.Join(failing, ","))
   153  			}
   154  			return "", nil
   155  		},
   156  	}
   157  }
   158  
   159  func (h *Healthcheck) consumerQueueReachable() fthealth.Check {
   160  	return fthealth.Check{
   161  		ID:               "ConsumerQueueReachable",
   162  		BusinessImpact:   "Publish metrics are not recorded. This will impact the SLA measurement.",
   163  		Name:             "ConsumerQueueReachable",
   164  		PanicGuide:       pamRunbookURL,
   165  		Severity:         1,
   166  		TechnicalSummary: "Kafka consumer is not reachable/healthy",
   167  		Checker:          h.checkConsumerConnectivity,
   168  	}
   169  }
   170  
   171  func (h *Healthcheck) consumerMonitorCheck() fthealth.Check {
   172  	return fthealth.Check{
   173  		ID:               "ConsumerQueueLagging",
   174  		BusinessImpact:   "Publish metrics are slowed down. This will impact the SLA measurement.",
   175  		Name:             "ConsumerQueueLagging",
   176  		PanicGuide:       pamRunbookURL,
   177  		Severity:         2,
   178  		TechnicalSummary: kafka.LagTechnicalSummary,
   179  		Checker:          h.checkConsumerMonitor,
   180  	}
   181  }
   182  
   183  func (h *Healthcheck) reflectPublishFailures() fthealth.Check {
   184  	return fthealth.Check{
   185  		ID:               "ReflectPublishFailures",
   186  		BusinessImpact:   "At least two of the last 10 publishes failed. This will reflect in the SLA measurement.",
   187  		Name:             "ReflectPublishFailures",
   188  		PanicGuide:       pamRunbookURL,
   189  		Severity:         1,
   190  		TechnicalSummary: "Publishes did not meet the SLA measurments",
   191  		Checker:          h.checkForPublishFailures,
   192  	}
   193  }
   194  
   195  func (h *Healthcheck) checkForPublishFailures() (string, error) {
   196  	failures := h.metricContainer.GetFailures()
   197  
   198  	failureThreshold := 2 //default
   199  	if h.config.HealthConf.FailureThreshold != 0 {
   200  		failureThreshold = h.config.HealthConf.FailureThreshold
   201  	}
   202  
   203  	if len(failures) >= failureThreshold {
   204  		return "", fmt.Errorf("%d publish failures happened during the last 10 publishes", len(failures))
   205  	}
   206  	return "", nil
   207  }
   208  
   209  func (h *Healthcheck) validationServicesReachable() fthealth.Check {
   210  	return fthealth.Check{
   211  		ID:               "validationServicesReachable",
   212  		BusinessImpact:   "Publish metrics might not be correct. False positive failures might be recorded. This will impact the SLA measurement.",
   213  		Name:             "validationServicesReachable",
   214  		PanicGuide:       pamRunbookURL,
   215  		Severity:         1,
   216  		TechnicalSummary: "Validation services are not reachable/healthy",
   217  		Checker:          h.checkValidationServicesReachable,
   218  	}
   219  }
   220  
   221  func (h *Healthcheck) checkValidationServicesReachable() (string, error) {
   222  	endpoints := h.config.ValidationEndpoints
   223  	var wg sync.WaitGroup
   224  	hcErrs := make(chan error, len(endpoints))
   225  	for _, url := range endpoints {
   226  		wg.Add(1)
   227  		healthcheckURL, err := inferHealthCheckURL(url)
   228  		if err != nil {
   229  			h.log.WithError(err).Errorf("Validation Service URL: [%s].", url)
   230  			continue
   231  		}
   232  		username, password := envs.GetValidationCredentials()
   233  		go checkServiceReachable(healthcheckURL, username, password, h.client, hcErrs, &wg, h.log)
   234  	}
   235  
   236  	wg.Wait()
   237  	close(hcErrs)
   238  	for err := range hcErrs {
   239  		if err != nil {
   240  			return "", err
   241  		}
   242  	}
   243  	return "", nil
   244  }
   245  
   246  func (h *Healthcheck) checkConsumerConnectivity() (string, error) {
   247  	if err := h.consumer.ConnectivityCheck(); err != nil {
   248  		return "", err
   249  	}
   250  	return "OK", nil
   251  }
   252  
   253  func (h *Healthcheck) checkConsumerMonitor() (string, error) {
   254  	if err := h.consumer.MonitorCheck(); err != nil {
   255  		return "", err
   256  	}
   257  	return "OK", nil
   258  }
   259  
   260  func checkServiceReachable(healthcheckURL string, username string, password string, client *http.Client, hcRes chan<- error, wg *sync.WaitGroup, log *logger.UPPLogger) {
   261  	defer wg.Done()
   262  	log.Debugf("Checking: %s", healthcheckURL)
   263  
   264  	req, err := http.NewRequest("GET", healthcheckURL, nil)
   265  	if err != nil {
   266  		hcRes <- fmt.Errorf("cannot create HTTP request with URL: [%s]: [%w]", healthcheckURL, err)
   267  		return
   268  	}
   269  
   270  	if username != "" && password != "" {
   271  		req.SetBasicAuth(username, password)
   272  	}
   273  
   274  	resp, err := client.Do(req)
   275  	if err != nil {
   276  		hcRes <- fmt.Errorf("healthcheck URL: [%s]: [%w]", healthcheckURL, err)
   277  		return
   278  	}
   279  	defer resp.Body.Close()
   280  	if resp.StatusCode != 200 {
   281  		hcRes <- fmt.Errorf("unhealthy statusCode received: [%d] for URL [%s]", resp.StatusCode, healthcheckURL)
   282  		return
   283  	}
   284  	hcRes <- nil
   285  }
   286  
   287  func (h *Healthcheck) readEnvironmentsReachable() []fthealth.Check {
   288  	for i := 0; !h.environments.AreReady() && i < 5; i++ {
   289  		h.log.Info("Environments not set, retry in 2s...")
   290  		time.Sleep(2 * time.Second)
   291  	}
   292  
   293  	hc := make([]fthealth.Check, 0, h.environments.Len())
   294  
   295  	for _, envName := range h.environments.Names() {
   296  		hc = append(hc, fthealth.Check{
   297  			ID:               envName + "-readEndpointsReachable",
   298  			BusinessImpact:   "Publish metrics might not be correct. False positive failures might be recorded. This will impact the SLA measurement.",
   299  			Name:             envName + "-readEndpointsReachable",
   300  			PanicGuide:       pamRunbookURL,
   301  			Severity:         1,
   302  			TechnicalSummary: "Read services are not reachable/healthy",
   303  			Checker: (&readEnvironmentHealthcheck{
   304  				env:       h.environments.Environment(envName),
   305  				client:    h.client,
   306  				appConfig: h.config,
   307  				log:       h.log,
   308  			}).checkReadEnvironmentReachable,
   309  		})
   310  	}
   311  	return hc
   312  }
   313  
   314  func (h *readEnvironmentHealthcheck) checkReadEnvironmentReachable() (string, error) {
   315  	var wg sync.WaitGroup
   316  	hcErrs := make(chan error, len(h.appConfig.MetricConf))
   317  
   318  	for _, metric := range h.appConfig.MetricConf {
   319  		var endpointURL *url.URL
   320  		var err error
   321  		var username, password string
   322  		if checks.AbsoluteURLRegex.MatchString(metric.Endpoint) {
   323  			endpointURL, err = url.Parse(metric.Endpoint)
   324  		} else {
   325  			endpointURL, err = url.Parse(h.env.ReadURL + metric.Endpoint)
   326  			username = h.env.Username
   327  			password = h.env.Password
   328  		}
   329  
   330  		if err != nil {
   331  			h.log.WithError(err).Errorf("Cannot parse url [%v]", metric.Endpoint)
   332  			continue
   333  		}
   334  
   335  		healthcheckURL := buildFtHealthcheckURL(*endpointURL, metric.Health)
   336  
   337  		wg.Add(1)
   338  		go checkServiceReachable(healthcheckURL, username, password, h.client, hcErrs, &wg, h.log)
   339  	}
   340  
   341  	wg.Wait()
   342  	close(hcErrs)
   343  	for err := range hcErrs {
   344  		if err != nil {
   345  			return "", err
   346  		}
   347  	}
   348  	return "", nil
   349  }
   350  
   351  func inferHealthCheckURL(serviceURL string) (string, error) {
   352  	parsedURL, err := url.Parse(serviceURL)
   353  	if err != nil {
   354  		return "", err
   355  	}
   356  
   357  	var newPath string
   358  	if strings.HasPrefix(parsedURL.Path, "/__") {
   359  		newPath = strings.SplitN(parsedURL.Path[1:], "/", 2)[0] + "/__health"
   360  	} else {
   361  		newPath = "/__health"
   362  	}
   363  
   364  	parsedURL.Path = newPath
   365  	return parsedURL.String(), nil
   366  }
   367  
   368  func buildFtHealthcheckURL(endpoint url.URL, health string) string {
   369  	endpoint.Path = health
   370  	endpoint.RawQuery = "" // strip query params
   371  	return endpoint.String()
   372  }