github.com/m-lab/locate@v0.17.6/heartbeat/heartbeat.go (about) 1 package heartbeat 2 3 import ( 4 "errors" 5 "fmt" 6 "log" 7 "sync" 8 "time" 9 10 "github.com/gomodule/redigo/redis" 11 "github.com/m-lab/go/host" 12 v2 "github.com/m-lab/locate/api/v2" 13 "github.com/m-lab/locate/memorystore" 14 "github.com/m-lab/locate/metrics" 15 "github.com/m-lab/locate/static" 16 ) 17 18 var ( 19 errInvalidArgument = errors.New("argument is invalid") 20 errPrometheus = errors.New("error saving Prometheus entry") 21 ) 22 23 type heartbeatStatusTracker struct { 24 MemorystoreClient[v2.HeartbeatMessage] 25 instances map[string]v2.HeartbeatMessage 26 mu sync.RWMutex 27 stop chan bool 28 lastUpdate time.Time 29 } 30 31 // MemorystoreClient is a client for reading and writing data in Memorystore. 32 // The interface takes in a type argument which specifies the types of values 33 // that are stored and can be retrived. 34 type MemorystoreClient[V any] interface { 35 Put(key string, field string, value redis.Scanner, opts *memorystore.PutOptions) error 36 GetAll() (map[string]V, error) 37 } 38 39 // NewHeartbeatStatusTracker returns a new StatusTracker implementation that uses 40 // a Memorystore client to cache (and later import) instance data from the Heartbeat Service. 41 // StopImport() must be called to release resources. 42 func NewHeartbeatStatusTracker(client MemorystoreClient[v2.HeartbeatMessage]) *heartbeatStatusTracker { 43 h := &heartbeatStatusTracker{ 44 MemorystoreClient: client, 45 instances: make(map[string]v2.HeartbeatMessage), 46 stop: make(chan bool), 47 } 48 49 // Start import loop. 50 go func(h *heartbeatStatusTracker) { 51 ticker := *time.NewTicker(static.MemorystoreExportPeriod) 52 defer ticker.Stop() 53 54 for { 55 select { 56 case <-h.stop: 57 return 58 case <-ticker.C: 59 h.importMemorystore() 60 } 61 } 62 }(h) 63 64 return h 65 } 66 67 // RegisterInstance adds a new v2.Registration message to the Memorystore client and keeps it 68 // locally. 69 func (h *heartbeatStatusTracker) RegisterInstance(rm v2.Registration) error { 70 hostname := rm.Hostname 71 opts := &memorystore.PutOptions{WithExpire: true} 72 if err := h.Put(hostname, "Registration", &rm, opts); err != nil { 73 return fmt.Errorf("%w: failed to write Registration message to Memorystore", err) 74 } 75 76 h.registerInstance(hostname, rm) 77 return nil 78 } 79 80 // UpdateHealth updates the v2.Health field for the instance in the Memorystore client and 81 // updates it locally. 82 func (h *heartbeatStatusTracker) UpdateHealth(hostname string, hm v2.Health) error { 83 opts := &memorystore.PutOptions{FieldMustExist: "Registration", WithExpire: true} 84 if err := h.Put(hostname, "Health", &hm, opts); err != nil { 85 return fmt.Errorf("%w: failed to write Health message to Memorystore", err) 86 } 87 return h.updateHealth(hostname, hm) 88 } 89 90 // UpdatePrometheus updates the v2.Prometheus field for the instances. 91 func (h *heartbeatStatusTracker) UpdatePrometheus(hostnames, machines map[string]bool) error { 92 var err error 93 h.mu.Lock() 94 defer h.mu.Unlock() 95 96 for _, instance := range h.instances { 97 pm := constructPrometheusMessage(instance, hostnames, machines) 98 if pm != nil { 99 updateErr := h.updatePrometheusMessage(instance, pm) 100 101 if updateErr != nil { 102 log.Printf("Failed to write Prometheus message for instance %s to Memorystore: %v", instance.Registration.Hostname, updateErr) 103 err = errPrometheus 104 } 105 } 106 } 107 108 return err 109 } 110 111 // Instances returns a mapping of all the v2.HeartbeatMessage instance keys to 112 // their values. 113 func (h *heartbeatStatusTracker) Instances() map[string]v2.HeartbeatMessage { 114 h.mu.RLock() 115 defer h.mu.RUnlock() 116 117 c := make(map[string]v2.HeartbeatMessage, len(h.instances)) 118 for k, v := range h.instances { 119 c[k] = v 120 } 121 122 return c 123 } 124 125 // Ready reports whether the import to Memorystore has complete successfully 126 // within 2x the export period. 127 func (h *heartbeatStatusTracker) Ready() bool { 128 h.mu.RLock() 129 defer h.mu.RUnlock() 130 return time.Since(h.lastUpdate) <= 2*static.MemorystoreExportPeriod 131 } 132 133 // StopImport stops importing instance data from the Memorystore. 134 // It must be called to release resources. 135 func (h *heartbeatStatusTracker) StopImport() { 136 h.stop <- true 137 } 138 139 func (h *heartbeatStatusTracker) registerInstance(hostname string, rm v2.Registration) { 140 h.mu.Lock() 141 defer h.mu.Unlock() 142 143 // Check if the instance has already been registered to avoid overwriting any 144 // Health/Prometheus data that already exists. 145 if instance, found := h.instances[hostname]; found { 146 instance.Registration = &rm 147 h.instances[hostname] = instance 148 return 149 } 150 151 h.instances[hostname] = v2.HeartbeatMessage{Registration: &rm} 152 } 153 154 func (h *heartbeatStatusTracker) updateHealth(hostname string, hm v2.Health) error { 155 h.mu.Lock() 156 defer h.mu.Unlock() 157 158 if instance, found := h.instances[hostname]; found { 159 instance.Health = &hm 160 h.instances[hostname] = instance 161 return nil 162 } 163 164 return fmt.Errorf("failed to find %s instance for health update", hostname) 165 } 166 167 // updatePrometheusMessage updates the v2.Prometheus field for a specific instance 168 // in Memorystore and locally. 169 func (h *heartbeatStatusTracker) updatePrometheusMessage(instance v2.HeartbeatMessage, pm *v2.Prometheus) error { 170 hostname := instance.Registration.Hostname 171 opts := &memorystore.PutOptions{FieldMustExist: "Registration", WithExpire: false} 172 173 // Update in Memorystore. 174 err := h.Put(hostname, "Prometheus", pm, opts) 175 if err != nil { 176 return err 177 } 178 179 // Update locally. 180 instance.Prometheus = pm 181 h.instances[hostname] = instance 182 return nil 183 } 184 185 func (h *heartbeatStatusTracker) importMemorystore() { 186 values, err := h.GetAll() 187 188 if err != nil { 189 metrics.ImportMemorystoreTotal.WithLabelValues(err.Error()).Inc() 190 return 191 } 192 193 metrics.ImportMemorystoreTotal.WithLabelValues("OK").Inc() 194 h.mu.Lock() 195 defer h.mu.Unlock() 196 h.instances = values 197 h.lastUpdate = time.Now() 198 h.updateMetrics() 199 } 200 201 // updateMetrics updates a Prometheus Gauge with the number of healthy instances per 202 // experiment. 203 // Note that if an experiment is deleted (i.e., there are no more experiment instances), 204 // the metric will still report the last known count. 205 func (h *heartbeatStatusTracker) updateMetrics() { 206 healthy := make(map[string]float64) 207 for _, instance := range h.instances { 208 if isHealthy(instance) { 209 healthy[instance.Registration.Experiment]++ 210 } 211 } 212 213 for experiment, count := range healthy { 214 metrics.LocateHealthStatus.WithLabelValues(experiment).Set(count) 215 } 216 } 217 218 // constructPrometheusMessage constructs a v2.Prometheus message for a specific instance 219 // from a map of hostname/machine Prometheus data. 220 // If no information is available for the instance, it returns nil. 221 func constructPrometheusMessage(instance v2.HeartbeatMessage, hostnames, machines map[string]bool) *v2.Prometheus { 222 if instance.Registration == nil { 223 return nil 224 } 225 226 var hostHealthy, hostFound, machineHealthy, machineFound bool 227 228 // Get Prometheus health data for the service hostname. 229 hostname := instance.Registration.Hostname 230 hostHealthy, hostFound = hostnames[hostname] 231 232 // Get Prometheus health data for the machine. 233 parts, err := host.Parse(hostname) 234 if err == nil { 235 machineHealthy, machineFound = machines[parts.String()] 236 } 237 238 // Create Prometheus health message. 239 if hostFound || machineFound { 240 // If Prometheus did not return any data about one of host or machine, 241 // treat it as healthy. 242 health := (!hostFound || hostHealthy) && (!machineFound || machineHealthy) 243 return &v2.Prometheus{Health: health} 244 } 245 246 // If no Prometheus data is available for either the host or machine (both missing), 247 // return nil. This case is treated the same way downstream as a healthy signal. 248 return nil 249 }