github.com/thanos-io/thanos@v0.32.5/pkg/receive/config.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package receive
     5  
     6  import (
     7  	"context"
     8  	"crypto/md5"
     9  	"encoding/binary"
    10  	"encoding/json"
    11  	"io"
    12  	"os"
    13  	"path/filepath"
    14  	"time"
    15  
    16  	"github.com/fsnotify/fsnotify"
    17  	"github.com/go-kit/log"
    18  	"github.com/go-kit/log/level"
    19  	"github.com/pkg/errors"
    20  	"github.com/prometheus/client_golang/prometheus"
    21  	"github.com/prometheus/client_golang/prometheus/promauto"
    22  	"github.com/prometheus/common/model"
    23  )
    24  
    25  var (
    26  	// An errParseConfigurationFile is returned by the ConfigWatcher when parsing failed.
    27  	errParseConfigurationFile = errors.New("configuration file is not parsable")
    28  	// An errEmptyConfigurationFile is returned by the ConfigWatcher when attempting to load an empty configuration file.
    29  	errEmptyConfigurationFile = errors.New("configuration file is empty")
    30  )
    31  
    32  type ReceiverMode string
    33  
    34  const (
    35  	RouterOnly     ReceiverMode = "RouterOnly"
    36  	IngestorOnly   ReceiverMode = "IngestorOnly"
    37  	RouterIngestor ReceiverMode = "RouterIngestor"
    38  )
    39  
    40  type Endpoint struct {
    41  	Address string `json:"address"`
    42  	AZ      string `json:"az"`
    43  }
    44  
    45  func (e *Endpoint) UnmarshalJSON(data []byte) error {
    46  	// First try to unmarshal as a string.
    47  	err := json.Unmarshal(data, &e.Address)
    48  	if err == nil {
    49  		return nil
    50  	}
    51  
    52  	// If that fails, try to unmarshal as an endpoint object.
    53  	type endpointAlias Endpoint
    54  	var configEndpoint endpointAlias
    55  	err = json.Unmarshal(data, &configEndpoint)
    56  	if err == nil {
    57  		e.Address = configEndpoint.Address
    58  		e.AZ = configEndpoint.AZ
    59  	}
    60  	return err
    61  }
    62  
    63  // HashringConfig represents the configuration for a hashring
    64  // a receive node knows about.
    65  type HashringConfig struct {
    66  	Hashring       string            `json:"hashring,omitempty"`
    67  	Tenants        []string          `json:"tenants,omitempty"`
    68  	Endpoints      []Endpoint        `json:"endpoints"`
    69  	Algorithm      HashringAlgorithm `json:"algorithm,omitempty"`
    70  	ExternalLabels map[string]string `json:"external_labels,omitempty"`
    71  }
    72  
    73  // ConfigWatcher is able to watch a file containing a hashring configuration
    74  // for updates.
    75  type ConfigWatcher struct {
    76  	ch       chan []HashringConfig
    77  	path     string
    78  	interval time.Duration
    79  	logger   log.Logger
    80  	watcher  *fsnotify.Watcher
    81  
    82  	hashGauge            prometheus.Gauge
    83  	successGauge         prometheus.Gauge
    84  	lastSuccessTimeGauge prometheus.Gauge
    85  	changesCounter       prometheus.Counter
    86  	errorCounter         prometheus.Counter
    87  	refreshCounter       prometheus.Counter
    88  	hashringNodesGauge   *prometheus.GaugeVec
    89  	hashringTenantsGauge *prometheus.GaugeVec
    90  
    91  	// lastLoadedConfigHash is the hash of the last successfully loaded configuration.
    92  	lastLoadedConfigHash float64
    93  }
    94  
    95  // NewConfigWatcher creates a new ConfigWatcher.
    96  func NewConfigWatcher(logger log.Logger, reg prometheus.Registerer, path string, interval model.Duration) (*ConfigWatcher, error) {
    97  	if logger == nil {
    98  		logger = log.NewNopLogger()
    99  	}
   100  
   101  	watcher, err := fsnotify.NewWatcher()
   102  	if err != nil {
   103  		return nil, errors.Wrap(err, "creating file watcher")
   104  	}
   105  	if err := watcher.Add(path); err != nil {
   106  		return nil, errors.Wrapf(err, "adding path %s to file watcher", path)
   107  	}
   108  
   109  	c := &ConfigWatcher{
   110  		ch:       make(chan []HashringConfig),
   111  		path:     path,
   112  		interval: time.Duration(interval),
   113  		logger:   logger,
   114  		watcher:  watcher,
   115  		hashGauge: promauto.With(reg).NewGauge(
   116  			prometheus.GaugeOpts{
   117  				Name: "thanos_receive_config_hash",
   118  				Help: "Hash of the currently loaded hashring configuration file.",
   119  			}),
   120  		successGauge: promauto.With(reg).NewGauge(
   121  			prometheus.GaugeOpts{
   122  				Name: "thanos_receive_config_last_reload_successful",
   123  				Help: "Whether the last hashring configuration file reload attempt was successful.",
   124  			}),
   125  		lastSuccessTimeGauge: promauto.With(reg).NewGauge(
   126  			prometheus.GaugeOpts{
   127  				Name: "thanos_receive_config_last_reload_success_timestamp_seconds",
   128  				Help: "Timestamp of the last successful hashring configuration file reload.",
   129  			}),
   130  		changesCounter: promauto.With(reg).NewCounter(
   131  			prometheus.CounterOpts{
   132  				Name: "thanos_receive_hashrings_file_changes_total",
   133  				Help: "The number of times the hashrings configuration file has changed.",
   134  			}),
   135  		errorCounter: promauto.With(reg).NewCounter(
   136  			prometheus.CounterOpts{
   137  				Name: "thanos_receive_hashrings_file_errors_total",
   138  				Help: "The number of errors watching the hashrings configuration file.",
   139  			}),
   140  		refreshCounter: promauto.With(reg).NewCounter(
   141  			prometheus.CounterOpts{
   142  				Name: "thanos_receive_hashrings_file_refreshes_total",
   143  				Help: "The number of refreshes of the hashrings configuration file.",
   144  			}),
   145  		hashringNodesGauge: promauto.With(reg).NewGaugeVec(
   146  			prometheus.GaugeOpts{
   147  				Name: "thanos_receive_hashring_nodes",
   148  				Help: "The number of nodes per hashring.",
   149  			},
   150  			[]string{"name"}),
   151  		hashringTenantsGauge: promauto.With(reg).NewGaugeVec(
   152  			prometheus.GaugeOpts{
   153  				Name: "thanos_receive_hashring_tenants",
   154  				Help: "The number of tenants per hashring.",
   155  			},
   156  			[]string{"name"}),
   157  	}
   158  	return c, nil
   159  }
   160  
   161  // Run starts the ConfigWatcher until the given context is canceled.
   162  func (cw *ConfigWatcher) Run(ctx context.Context) {
   163  	defer cw.Stop()
   164  
   165  	cw.refresh(ctx)
   166  
   167  	ticker := time.NewTicker(cw.interval)
   168  	defer ticker.Stop()
   169  
   170  	for {
   171  		select {
   172  		case <-ctx.Done():
   173  			return
   174  
   175  		case event := <-cw.watcher.Events:
   176  			// fsnotify sometimes sends a bunch of events without name or operation.
   177  			// It's unclear what they are and why they are sent - filter them out.
   178  			if event.Name == "" {
   179  				break
   180  			}
   181  			// Everything but a CHMOD requires rereading.
   182  			// If the file was removed, we can't read it, so skip.
   183  			if event.Op^(fsnotify.Chmod|fsnotify.Remove) == 0 {
   184  				break
   185  			}
   186  			// Changes to a file can spawn various sequences of events with
   187  			// different combinations of operations. For all practical purposes
   188  			// this is inaccurate.
   189  			// The most reliable solution is to reload everything if anything happens.
   190  			cw.refresh(ctx)
   191  
   192  		case <-ticker.C:
   193  			// Setting a new watch after an update might fail. Make sure we don't lose
   194  			// those files forever.
   195  			cw.refresh(ctx)
   196  
   197  		case err := <-cw.watcher.Errors:
   198  			if err != nil {
   199  				cw.errorCounter.Inc()
   200  				level.Error(cw.logger).Log("msg", "error watching file", "err", err)
   201  			}
   202  		}
   203  	}
   204  }
   205  
   206  // C returns a chan that gets hashring configuration updates.
   207  func (cw *ConfigWatcher) C() <-chan []HashringConfig {
   208  	return cw.ch
   209  }
   210  
   211  // ValidateConfig returns an error if the configuration that's being watched is not valid.
   212  func (cw *ConfigWatcher) ValidateConfig() error {
   213  	_, _, err := loadConfig(cw.logger, cw.path)
   214  	return err
   215  }
   216  
   217  // Stop shuts down the config watcher.
   218  func (cw *ConfigWatcher) Stop() {
   219  	level.Debug(cw.logger).Log("msg", "stopping hashring configuration watcher...", "path", cw.path)
   220  
   221  	done := make(chan struct{})
   222  	defer close(done)
   223  
   224  	// Closing the watcher will deadlock unless all events and errors are drained.
   225  	go func() {
   226  		for {
   227  			select {
   228  			case <-cw.watcher.Errors:
   229  			case <-cw.watcher.Events:
   230  			// Drain all events and errors.
   231  			case <-done:
   232  				return
   233  			}
   234  		}
   235  	}()
   236  	if err := cw.watcher.Close(); err != nil {
   237  		level.Error(cw.logger).Log("msg", "error closing file watcher", "path", cw.path, "err", err)
   238  	}
   239  
   240  	close(cw.ch)
   241  	level.Debug(cw.logger).Log("msg", "hashring configuration watcher stopped")
   242  }
   243  
   244  // refresh reads the configured file and sends the hashring configuration on the channel.
   245  func (cw *ConfigWatcher) refresh(ctx context.Context) {
   246  	cw.refreshCounter.Inc()
   247  
   248  	config, cfgHash, err := loadConfig(cw.logger, cw.path)
   249  	if err != nil {
   250  		cw.errorCounter.Inc()
   251  		level.Error(cw.logger).Log("msg", "failed to load configuration file", "err", err, "path", cw.path)
   252  		return
   253  	}
   254  
   255  	// If there was no change to the configuration, return early.
   256  	if cw.lastLoadedConfigHash == cfgHash {
   257  		return
   258  	}
   259  
   260  	cw.changesCounter.Inc()
   261  
   262  	// Save the last known configuration.
   263  	cw.lastLoadedConfigHash = cfgHash
   264  	cw.hashGauge.Set(cfgHash)
   265  	cw.successGauge.Set(1)
   266  	cw.lastSuccessTimeGauge.SetToCurrentTime()
   267  
   268  	for _, c := range config {
   269  		cw.hashringNodesGauge.WithLabelValues(c.Hashring).Set(float64(len(c.Endpoints)))
   270  		cw.hashringTenantsGauge.WithLabelValues(c.Hashring).Set(float64(len(c.Tenants)))
   271  	}
   272  
   273  	level.Debug(cw.logger).Log("msg", "refreshed hashring config")
   274  	select {
   275  	case <-ctx.Done():
   276  		return
   277  	case cw.ch <- config:
   278  		return
   279  	}
   280  }
   281  
   282  func ConfigFromWatcher(ctx context.Context, updates chan<- []HashringConfig, cw *ConfigWatcher) error {
   283  	defer close(updates)
   284  	go cw.Run(ctx)
   285  
   286  	for {
   287  		select {
   288  		case cfg, ok := <-cw.C():
   289  			if !ok {
   290  				return errors.New("hashring config watcher stopped unexpectedly")
   291  			}
   292  			updates <- cfg
   293  		case <-ctx.Done():
   294  			return ctx.Err()
   295  		}
   296  	}
   297  }
   298  
   299  // ParseConfig parses the raw configuration content and returns a HashringConfig.
   300  func ParseConfig(content []byte) ([]HashringConfig, error) {
   301  	var config []HashringConfig
   302  	err := json.Unmarshal(content, &config)
   303  	return config, err
   304  }
   305  
   306  // loadConfig loads raw configuration content and returns a configuration.
   307  func loadConfig(logger log.Logger, path string) ([]HashringConfig, float64, error) {
   308  	cfgContent, err := readFile(logger, path)
   309  	if err != nil {
   310  		return nil, 0, errors.Wrap(err, "failed to read configuration file")
   311  	}
   312  
   313  	config, err := ParseConfig(cfgContent)
   314  	if err != nil {
   315  		return nil, 0, errors.Wrapf(errParseConfigurationFile, "failed to parse configuration file: %v", err)
   316  	}
   317  
   318  	// If hashring is empty, return an error.
   319  	if len(config) == 0 {
   320  		return nil, 0, errors.Wrapf(errEmptyConfigurationFile, "failed to load configuration file, path: %s", path)
   321  	}
   322  
   323  	return config, hashAsMetricValue(cfgContent), nil
   324  }
   325  
   326  // readFile reads the configuration file and returns content of configuration file.
   327  func readFile(logger log.Logger, path string) ([]byte, error) {
   328  	fd, err := os.Open(filepath.Clean(path))
   329  	if err != nil {
   330  		return nil, err
   331  	}
   332  	defer func() {
   333  		if err := fd.Close(); err != nil {
   334  			level.Error(logger).Log("msg", "failed to close file", "err", err, "path", path)
   335  		}
   336  	}()
   337  
   338  	return io.ReadAll(fd)
   339  }
   340  
   341  // hashAsMetricValue generates metric value from hash of data.
   342  func hashAsMetricValue(data []byte) float64 {
   343  	sum := md5.Sum(data)
   344  	// We only want 48 bits as a float64 only has a 53 bit mantissa.
   345  	smallSum := sum[0:6]
   346  	var bytes = make([]byte, 8)
   347  	copy(bytes, smallSum)
   348  	return float64(binary.LittleEndian.Uint64(bytes))
   349  }