github.com/thanos-io/thanos@v0.32.5/pkg/receive/config.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package receive 5 6 import ( 7 "context" 8 "crypto/md5" 9 "encoding/binary" 10 "encoding/json" 11 "io" 12 "os" 13 "path/filepath" 14 "time" 15 16 "github.com/fsnotify/fsnotify" 17 "github.com/go-kit/log" 18 "github.com/go-kit/log/level" 19 "github.com/pkg/errors" 20 "github.com/prometheus/client_golang/prometheus" 21 "github.com/prometheus/client_golang/prometheus/promauto" 22 "github.com/prometheus/common/model" 23 ) 24 25 var ( 26 // An errParseConfigurationFile is returned by the ConfigWatcher when parsing failed. 27 errParseConfigurationFile = errors.New("configuration file is not parsable") 28 // An errEmptyConfigurationFile is returned by the ConfigWatcher when attempting to load an empty configuration file. 29 errEmptyConfigurationFile = errors.New("configuration file is empty") 30 ) 31 32 type ReceiverMode string 33 34 const ( 35 RouterOnly ReceiverMode = "RouterOnly" 36 IngestorOnly ReceiverMode = "IngestorOnly" 37 RouterIngestor ReceiverMode = "RouterIngestor" 38 ) 39 40 type Endpoint struct { 41 Address string `json:"address"` 42 AZ string `json:"az"` 43 } 44 45 func (e *Endpoint) UnmarshalJSON(data []byte) error { 46 // First try to unmarshal as a string. 47 err := json.Unmarshal(data, &e.Address) 48 if err == nil { 49 return nil 50 } 51 52 // If that fails, try to unmarshal as an endpoint object. 53 type endpointAlias Endpoint 54 var configEndpoint endpointAlias 55 err = json.Unmarshal(data, &configEndpoint) 56 if err == nil { 57 e.Address = configEndpoint.Address 58 e.AZ = configEndpoint.AZ 59 } 60 return err 61 } 62 63 // HashringConfig represents the configuration for a hashring 64 // a receive node knows about. 65 type HashringConfig struct { 66 Hashring string `json:"hashring,omitempty"` 67 Tenants []string `json:"tenants,omitempty"` 68 Endpoints []Endpoint `json:"endpoints"` 69 Algorithm HashringAlgorithm `json:"algorithm,omitempty"` 70 ExternalLabels map[string]string `json:"external_labels,omitempty"` 71 } 72 73 // ConfigWatcher is able to watch a file containing a hashring configuration 74 // for updates. 75 type ConfigWatcher struct { 76 ch chan []HashringConfig 77 path string 78 interval time.Duration 79 logger log.Logger 80 watcher *fsnotify.Watcher 81 82 hashGauge prometheus.Gauge 83 successGauge prometheus.Gauge 84 lastSuccessTimeGauge prometheus.Gauge 85 changesCounter prometheus.Counter 86 errorCounter prometheus.Counter 87 refreshCounter prometheus.Counter 88 hashringNodesGauge *prometheus.GaugeVec 89 hashringTenantsGauge *prometheus.GaugeVec 90 91 // lastLoadedConfigHash is the hash of the last successfully loaded configuration. 92 lastLoadedConfigHash float64 93 } 94 95 // NewConfigWatcher creates a new ConfigWatcher. 96 func NewConfigWatcher(logger log.Logger, reg prometheus.Registerer, path string, interval model.Duration) (*ConfigWatcher, error) { 97 if logger == nil { 98 logger = log.NewNopLogger() 99 } 100 101 watcher, err := fsnotify.NewWatcher() 102 if err != nil { 103 return nil, errors.Wrap(err, "creating file watcher") 104 } 105 if err := watcher.Add(path); err != nil { 106 return nil, errors.Wrapf(err, "adding path %s to file watcher", path) 107 } 108 109 c := &ConfigWatcher{ 110 ch: make(chan []HashringConfig), 111 path: path, 112 interval: time.Duration(interval), 113 logger: logger, 114 watcher: watcher, 115 hashGauge: promauto.With(reg).NewGauge( 116 prometheus.GaugeOpts{ 117 Name: "thanos_receive_config_hash", 118 Help: "Hash of the currently loaded hashring configuration file.", 119 }), 120 successGauge: promauto.With(reg).NewGauge( 121 prometheus.GaugeOpts{ 122 Name: "thanos_receive_config_last_reload_successful", 123 Help: "Whether the last hashring configuration file reload attempt was successful.", 124 }), 125 lastSuccessTimeGauge: promauto.With(reg).NewGauge( 126 prometheus.GaugeOpts{ 127 Name: "thanos_receive_config_last_reload_success_timestamp_seconds", 128 Help: "Timestamp of the last successful hashring configuration file reload.", 129 }), 130 changesCounter: promauto.With(reg).NewCounter( 131 prometheus.CounterOpts{ 132 Name: "thanos_receive_hashrings_file_changes_total", 133 Help: "The number of times the hashrings configuration file has changed.", 134 }), 135 errorCounter: promauto.With(reg).NewCounter( 136 prometheus.CounterOpts{ 137 Name: "thanos_receive_hashrings_file_errors_total", 138 Help: "The number of errors watching the hashrings configuration file.", 139 }), 140 refreshCounter: promauto.With(reg).NewCounter( 141 prometheus.CounterOpts{ 142 Name: "thanos_receive_hashrings_file_refreshes_total", 143 Help: "The number of refreshes of the hashrings configuration file.", 144 }), 145 hashringNodesGauge: promauto.With(reg).NewGaugeVec( 146 prometheus.GaugeOpts{ 147 Name: "thanos_receive_hashring_nodes", 148 Help: "The number of nodes per hashring.", 149 }, 150 []string{"name"}), 151 hashringTenantsGauge: promauto.With(reg).NewGaugeVec( 152 prometheus.GaugeOpts{ 153 Name: "thanos_receive_hashring_tenants", 154 Help: "The number of tenants per hashring.", 155 }, 156 []string{"name"}), 157 } 158 return c, nil 159 } 160 161 // Run starts the ConfigWatcher until the given context is canceled. 162 func (cw *ConfigWatcher) Run(ctx context.Context) { 163 defer cw.Stop() 164 165 cw.refresh(ctx) 166 167 ticker := time.NewTicker(cw.interval) 168 defer ticker.Stop() 169 170 for { 171 select { 172 case <-ctx.Done(): 173 return 174 175 case event := <-cw.watcher.Events: 176 // fsnotify sometimes sends a bunch of events without name or operation. 177 // It's unclear what they are and why they are sent - filter them out. 178 if event.Name == "" { 179 break 180 } 181 // Everything but a CHMOD requires rereading. 182 // If the file was removed, we can't read it, so skip. 183 if event.Op^(fsnotify.Chmod|fsnotify.Remove) == 0 { 184 break 185 } 186 // Changes to a file can spawn various sequences of events with 187 // different combinations of operations. For all practical purposes 188 // this is inaccurate. 189 // The most reliable solution is to reload everything if anything happens. 190 cw.refresh(ctx) 191 192 case <-ticker.C: 193 // Setting a new watch after an update might fail. Make sure we don't lose 194 // those files forever. 195 cw.refresh(ctx) 196 197 case err := <-cw.watcher.Errors: 198 if err != nil { 199 cw.errorCounter.Inc() 200 level.Error(cw.logger).Log("msg", "error watching file", "err", err) 201 } 202 } 203 } 204 } 205 206 // C returns a chan that gets hashring configuration updates. 207 func (cw *ConfigWatcher) C() <-chan []HashringConfig { 208 return cw.ch 209 } 210 211 // ValidateConfig returns an error if the configuration that's being watched is not valid. 212 func (cw *ConfigWatcher) ValidateConfig() error { 213 _, _, err := loadConfig(cw.logger, cw.path) 214 return err 215 } 216 217 // Stop shuts down the config watcher. 218 func (cw *ConfigWatcher) Stop() { 219 level.Debug(cw.logger).Log("msg", "stopping hashring configuration watcher...", "path", cw.path) 220 221 done := make(chan struct{}) 222 defer close(done) 223 224 // Closing the watcher will deadlock unless all events and errors are drained. 225 go func() { 226 for { 227 select { 228 case <-cw.watcher.Errors: 229 case <-cw.watcher.Events: 230 // Drain all events and errors. 231 case <-done: 232 return 233 } 234 } 235 }() 236 if err := cw.watcher.Close(); err != nil { 237 level.Error(cw.logger).Log("msg", "error closing file watcher", "path", cw.path, "err", err) 238 } 239 240 close(cw.ch) 241 level.Debug(cw.logger).Log("msg", "hashring configuration watcher stopped") 242 } 243 244 // refresh reads the configured file and sends the hashring configuration on the channel. 245 func (cw *ConfigWatcher) refresh(ctx context.Context) { 246 cw.refreshCounter.Inc() 247 248 config, cfgHash, err := loadConfig(cw.logger, cw.path) 249 if err != nil { 250 cw.errorCounter.Inc() 251 level.Error(cw.logger).Log("msg", "failed to load configuration file", "err", err, "path", cw.path) 252 return 253 } 254 255 // If there was no change to the configuration, return early. 256 if cw.lastLoadedConfigHash == cfgHash { 257 return 258 } 259 260 cw.changesCounter.Inc() 261 262 // Save the last known configuration. 263 cw.lastLoadedConfigHash = cfgHash 264 cw.hashGauge.Set(cfgHash) 265 cw.successGauge.Set(1) 266 cw.lastSuccessTimeGauge.SetToCurrentTime() 267 268 for _, c := range config { 269 cw.hashringNodesGauge.WithLabelValues(c.Hashring).Set(float64(len(c.Endpoints))) 270 cw.hashringTenantsGauge.WithLabelValues(c.Hashring).Set(float64(len(c.Tenants))) 271 } 272 273 level.Debug(cw.logger).Log("msg", "refreshed hashring config") 274 select { 275 case <-ctx.Done(): 276 return 277 case cw.ch <- config: 278 return 279 } 280 } 281 282 func ConfigFromWatcher(ctx context.Context, updates chan<- []HashringConfig, cw *ConfigWatcher) error { 283 defer close(updates) 284 go cw.Run(ctx) 285 286 for { 287 select { 288 case cfg, ok := <-cw.C(): 289 if !ok { 290 return errors.New("hashring config watcher stopped unexpectedly") 291 } 292 updates <- cfg 293 case <-ctx.Done(): 294 return ctx.Err() 295 } 296 } 297 } 298 299 // ParseConfig parses the raw configuration content and returns a HashringConfig. 300 func ParseConfig(content []byte) ([]HashringConfig, error) { 301 var config []HashringConfig 302 err := json.Unmarshal(content, &config) 303 return config, err 304 } 305 306 // loadConfig loads raw configuration content and returns a configuration. 307 func loadConfig(logger log.Logger, path string) ([]HashringConfig, float64, error) { 308 cfgContent, err := readFile(logger, path) 309 if err != nil { 310 return nil, 0, errors.Wrap(err, "failed to read configuration file") 311 } 312 313 config, err := ParseConfig(cfgContent) 314 if err != nil { 315 return nil, 0, errors.Wrapf(errParseConfigurationFile, "failed to parse configuration file: %v", err) 316 } 317 318 // If hashring is empty, return an error. 319 if len(config) == 0 { 320 return nil, 0, errors.Wrapf(errEmptyConfigurationFile, "failed to load configuration file, path: %s", path) 321 } 322 323 return config, hashAsMetricValue(cfgContent), nil 324 } 325 326 // readFile reads the configuration file and returns content of configuration file. 327 func readFile(logger log.Logger, path string) ([]byte, error) { 328 fd, err := os.Open(filepath.Clean(path)) 329 if err != nil { 330 return nil, err 331 } 332 defer func() { 333 if err := fd.Close(); err != nil { 334 level.Error(logger).Log("msg", "failed to close file", "err", err, "path", path) 335 } 336 }() 337 338 return io.ReadAll(fd) 339 } 340 341 // hashAsMetricValue generates metric value from hash of data. 342 func hashAsMetricValue(data []byte) float64 { 343 sum := md5.Sum(data) 344 // We only want 48 bits as a float64 only has a 53 bit mantissa. 345 smallSum := sum[0:6] 346 var bytes = make([]byte, 8) 347 copy(bytes, smallSum) 348 return float64(binary.LittleEndian.Uint64(bytes)) 349 }