github.com/m-lab/locate@v0.17.6/heartbeat/heartbeat.go (about)

     1  package heartbeat
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"log"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/gomodule/redigo/redis"
    11  	"github.com/m-lab/go/host"
    12  	v2 "github.com/m-lab/locate/api/v2"
    13  	"github.com/m-lab/locate/memorystore"
    14  	"github.com/m-lab/locate/metrics"
    15  	"github.com/m-lab/locate/static"
    16  )
    17  
// Sentinel errors for this package.
var (
	// errInvalidArgument is not referenced in this part of the file;
	// presumably it is returned by other code in the package — TODO confirm.
	errInvalidArgument = errors.New("argument is invalid")
	// errPrometheus is returned by UpdatePrometheus when writing the Prometheus
	// message failed for at least one instance.
	errPrometheus      = errors.New("error saving Prometheus entry")
)
    22  
// heartbeatStatusTracker keeps a local copy of instance heartbeat data and
// reads/writes that data in Memorystore through the embedded client.
type heartbeatStatusTracker struct {
	// Embedded client; its Put and GetAll methods are called directly on the
	// tracker.
	MemorystoreClient[v2.HeartbeatMessage]
	// instances is the local cache of heartbeat messages. Entries written by
	// this type are keyed by hostname.
	instances  map[string]v2.HeartbeatMessage
	// mu guards instances and lastUpdate.
	mu         sync.RWMutex
	// stop signals the background import goroutine to return.
	stop       chan bool
	// lastUpdate is the time of the most recent successful import from
	// Memorystore; Ready compares against it.
	lastUpdate time.Time
}
    30  
// MemorystoreClient is a client for reading and writing data in Memorystore.
// The interface takes in a type argument which specifies the types of values
// that are stored and can be retrieved.
type MemorystoreClient[V any] interface {
	// Put writes value for the given key and field. In this file the key is an
	// instance hostname and the field is one of "Registration", "Health" or
	// "Prometheus"; the exact effect of opts is defined by the memorystore
	// package.
	Put(key string, field string, value redis.Scanner, opts *memorystore.PutOptions) error
	// GetAll returns the stored values as a map (presumably keyed the same way
	// as Put — verify against the implementation).
	GetAll() (map[string]V, error)
}
    38  
    39  // NewHeartbeatStatusTracker returns a new StatusTracker implementation that uses
    40  // a Memorystore client to cache (and later import) instance data from the Heartbeat Service.
    41  // StopImport() must be called to release resources.
    42  func NewHeartbeatStatusTracker(client MemorystoreClient[v2.HeartbeatMessage]) *heartbeatStatusTracker {
    43  	h := &heartbeatStatusTracker{
    44  		MemorystoreClient: client,
    45  		instances:         make(map[string]v2.HeartbeatMessage),
    46  		stop:              make(chan bool),
    47  	}
    48  
    49  	// Start import loop.
    50  	go func(h *heartbeatStatusTracker) {
    51  		ticker := *time.NewTicker(static.MemorystoreExportPeriod)
    52  		defer ticker.Stop()
    53  
    54  		for {
    55  			select {
    56  			case <-h.stop:
    57  				return
    58  			case <-ticker.C:
    59  				h.importMemorystore()
    60  			}
    61  		}
    62  	}(h)
    63  
    64  	return h
    65  }
    66  
    67  // RegisterInstance adds a new v2.Registration message to the Memorystore client and keeps it
    68  // locally.
    69  func (h *heartbeatStatusTracker) RegisterInstance(rm v2.Registration) error {
    70  	hostname := rm.Hostname
    71  	opts := &memorystore.PutOptions{WithExpire: true}
    72  	if err := h.Put(hostname, "Registration", &rm, opts); err != nil {
    73  		return fmt.Errorf("%w: failed to write Registration message to Memorystore", err)
    74  	}
    75  
    76  	h.registerInstance(hostname, rm)
    77  	return nil
    78  }
    79  
    80  // UpdateHealth updates the v2.Health field for the instance in the Memorystore client and
    81  // updates it locally.
    82  func (h *heartbeatStatusTracker) UpdateHealth(hostname string, hm v2.Health) error {
    83  	opts := &memorystore.PutOptions{FieldMustExist: "Registration", WithExpire: true}
    84  	if err := h.Put(hostname, "Health", &hm, opts); err != nil {
    85  		return fmt.Errorf("%w: failed to write Health message to Memorystore", err)
    86  	}
    87  	return h.updateHealth(hostname, hm)
    88  }
    89  
    90  // UpdatePrometheus updates the v2.Prometheus field for the instances.
    91  func (h *heartbeatStatusTracker) UpdatePrometheus(hostnames, machines map[string]bool) error {
    92  	var err error
    93  	h.mu.Lock()
    94  	defer h.mu.Unlock()
    95  
    96  	for _, instance := range h.instances {
    97  		pm := constructPrometheusMessage(instance, hostnames, machines)
    98  		if pm != nil {
    99  			updateErr := h.updatePrometheusMessage(instance, pm)
   100  
   101  			if updateErr != nil {
   102  				log.Printf("Failed to write Prometheus message for instance %s to Memorystore: %v", instance.Registration.Hostname, updateErr)
   103  				err = errPrometheus
   104  			}
   105  		}
   106  	}
   107  
   108  	return err
   109  }
   110  
   111  // Instances returns a mapping of all the v2.HeartbeatMessage instance keys to
   112  // their values.
   113  func (h *heartbeatStatusTracker) Instances() map[string]v2.HeartbeatMessage {
   114  	h.mu.RLock()
   115  	defer h.mu.RUnlock()
   116  
   117  	c := make(map[string]v2.HeartbeatMessage, len(h.instances))
   118  	for k, v := range h.instances {
   119  		c[k] = v
   120  	}
   121  
   122  	return c
   123  }
   124  
   125  // Ready reports whether the import to Memorystore has complete successfully
   126  // within 2x the export period.
   127  func (h *heartbeatStatusTracker) Ready() bool {
   128  	h.mu.RLock()
   129  	defer h.mu.RUnlock()
   130  	return time.Since(h.lastUpdate) <= 2*static.MemorystoreExportPeriod
   131  }
   132  
// StopImport stops importing instance data from the Memorystore.
// It must be called to release resources.
//
// The stop channel is unbuffered, so this call blocks until the import
// goroutine started by NewHeartbeatStatusTracker receives the signal. That
// goroutine returns after the first signal, so as far as this file shows a
// second call would block indefinitely.
func (h *heartbeatStatusTracker) StopImport() {
	h.stop <- true
}
   138  
   139  func (h *heartbeatStatusTracker) registerInstance(hostname string, rm v2.Registration) {
   140  	h.mu.Lock()
   141  	defer h.mu.Unlock()
   142  
   143  	// Check if the instance has already been registered to avoid overwriting any
   144  	// Health/Prometheus data that already exists.
   145  	if instance, found := h.instances[hostname]; found {
   146  		instance.Registration = &rm
   147  		h.instances[hostname] = instance
   148  		return
   149  	}
   150  
   151  	h.instances[hostname] = v2.HeartbeatMessage{Registration: &rm}
   152  }
   153  
   154  func (h *heartbeatStatusTracker) updateHealth(hostname string, hm v2.Health) error {
   155  	h.mu.Lock()
   156  	defer h.mu.Unlock()
   157  
   158  	if instance, found := h.instances[hostname]; found {
   159  		instance.Health = &hm
   160  		h.instances[hostname] = instance
   161  		return nil
   162  	}
   163  
   164  	return fmt.Errorf("failed to find %s instance for health update", hostname)
   165  }
   166  
   167  // updatePrometheusMessage updates the v2.Prometheus field for a specific instance
   168  // in Memorystore and locally.
   169  func (h *heartbeatStatusTracker) updatePrometheusMessage(instance v2.HeartbeatMessage, pm *v2.Prometheus) error {
   170  	hostname := instance.Registration.Hostname
   171  	opts := &memorystore.PutOptions{FieldMustExist: "Registration", WithExpire: false}
   172  
   173  	// Update in Memorystore.
   174  	err := h.Put(hostname, "Prometheus", pm, opts)
   175  	if err != nil {
   176  		return err
   177  	}
   178  
   179  	// Update locally.
   180  	instance.Prometheus = pm
   181  	h.instances[hostname] = instance
   182  	return nil
   183  }
   184  
   185  func (h *heartbeatStatusTracker) importMemorystore() {
   186  	values, err := h.GetAll()
   187  
   188  	if err != nil {
   189  		metrics.ImportMemorystoreTotal.WithLabelValues(err.Error()).Inc()
   190  		return
   191  	}
   192  
   193  	metrics.ImportMemorystoreTotal.WithLabelValues("OK").Inc()
   194  	h.mu.Lock()
   195  	defer h.mu.Unlock()
   196  	h.instances = values
   197  	h.lastUpdate = time.Now()
   198  	h.updateMetrics()
   199  }
   200  
   201  // updateMetrics updates a Prometheus Gauge with the number of healthy instances per
   202  // experiment.
   203  // Note that if an experiment is deleted (i.e., there are no more experiment instances),
   204  // the metric will still report the last known count.
   205  func (h *heartbeatStatusTracker) updateMetrics() {
   206  	healthy := make(map[string]float64)
   207  	for _, instance := range h.instances {
   208  		if isHealthy(instance) {
   209  			healthy[instance.Registration.Experiment]++
   210  		}
   211  	}
   212  
   213  	for experiment, count := range healthy {
   214  		metrics.LocateHealthStatus.WithLabelValues(experiment).Set(count)
   215  	}
   216  }
   217  
   218  // constructPrometheusMessage constructs a v2.Prometheus message for a specific instance
   219  // from a map of hostname/machine Prometheus data.
   220  // If no information is available for the instance, it returns nil.
   221  func constructPrometheusMessage(instance v2.HeartbeatMessage, hostnames, machines map[string]bool) *v2.Prometheus {
   222  	if instance.Registration == nil {
   223  		return nil
   224  	}
   225  
   226  	var hostHealthy, hostFound, machineHealthy, machineFound bool
   227  
   228  	// Get Prometheus health data for the service hostname.
   229  	hostname := instance.Registration.Hostname
   230  	hostHealthy, hostFound = hostnames[hostname]
   231  
   232  	// Get Prometheus health data for the machine.
   233  	parts, err := host.Parse(hostname)
   234  	if err == nil {
   235  		machineHealthy, machineFound = machines[parts.String()]
   236  	}
   237  
   238  	// Create Prometheus health message.
   239  	if hostFound || machineFound {
   240  		// If Prometheus did not return any data about one of host or machine,
   241  		// treat it as healthy.
   242  		health := (!hostFound || hostHealthy) && (!machineFound || machineHealthy)
   243  		return &v2.Prometheus{Health: health}
   244  	}
   245  
   246  	// If no Prometheus data is available for either the host or machine (both missing),
   247  	// return nil. This case is treated the same way downstream as a healthy signal.
   248  	return nil
   249  }