github.com/thanos-io/thanos@v0.32.5/pkg/reloader/reloader.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  // Package reloader contains helpers to trigger reloads of Prometheus instances
     5  // on configuration changes and to substitute environment variables in config files.
     6  //
     7  // Reloader type is useful when you want to:
     8  //
     9  //   - Watch on changes against certain file e.g (`cfgFile`).
    10  //   - Optionally, specify different output file for watched `cfgFile` (`cfgOutputFile`).
    11  //     This will also try to decompress the `cfgFile` if needed and substitute ALL the envvars using the Kubernetes substitution format: (`$(var)`)
    12  //   - Watch on changes against certain directories (`watchedDirs`).
    13  //
    14  // Once any of those changes, the Prometheus at the given `reloadURL` is notified, causing it to reload its configuration and rules.
    15  //
    16  // Example of creating a reloader:
    17  //
    18  //		u, _ := url.Parse("http://localhost:9090")
    19  //		rl := reloader.New(nil, nil, &reloader.Options{
    20  //			ReloadURL:     reloader.ReloadURLFromBase(u),
    21  //			CfgFile:       "/path/to/cfg",
    22  //			CfgOutputFile: "/path/to/cfg.out",
    23  //			WatchedDirs:      []string{"/path/to/dirs"},
    24  //			WatchInterval: 3 * time.Minute,
    25  //			RetryInterval: 5 * time.Second,
    26  //	 })
    27  //
    28  // The url of reloads can be generated with function ReloadURLFromBase().
    29  // It will append the default path of reload into the given url:
    30  //
    31  //	u, _ := url.Parse("http://localhost:9090")
    32  //	reloader.ReloadURLFromBase(u) // It will return "http://localhost:9090/-/reload"
    33  //
    34  // Start watching for changes; Watch keeps running until the context gets canceled:
    35  //
    36  //	ctx, cancel := context.WithCancel(context.Background())
    37  //	go func() {
    38  //		if err := rl.Watch(ctx); err != nil {
    39  //			log.Fatal(err)
    40  //		}
    41  //	}()
    42  //	// ...
    43  //	cancel()
    44  //
    45  // On every watch interval, even if no file-system event was received, the reloader re-computes
    46  // the hashes of the given config file and directories and compares them with the last result.
    47  //
    48  // A basic example of configuration template with environment variables:
    49  //
    50  //	global:
    51  //	  external_labels:
    52  //	    replica: '$(HOSTNAME)'
    53  package reloader
    54  
    55  import (
    56  	"bytes"
    57  	"compress/gzip"
    58  	"context"
    59  	"hash"
    60  	"io"
    61  	"net/http"
    62  	"net/url"
    63  	"os"
    64  	"path"
    65  	"path/filepath"
    66  	"regexp"
    67  	"strings"
    68  	"sync"
    69  	"time"
    70  
    71  	"github.com/fsnotify/fsnotify"
    72  	"github.com/go-kit/log"
    73  	"github.com/go-kit/log/level"
    74  	"github.com/minio/sha256-simd"
    75  	"github.com/pkg/errors"
    76  	"github.com/prometheus/client_golang/prometheus"
    77  	"github.com/prometheus/client_golang/prometheus/promauto"
    78  
    79  	"github.com/thanos-io/thanos/pkg/runutil"
    80  )
    81  
// Reloader can watch config files and trigger reloads of a Prometheus server.
// It optionally substitutes environment variables in the configuration.
// Referenced environment variables must be of the form `$(var)` (not `$var` or `${var}`).
type Reloader struct {
	// Configuration. Everything except httpClient is copied from Options by
	// New; httpClient starts as the zero http.Client and can be replaced
	// with SetHttpClient.
	logger        log.Logger
	reloadURL     *url.URL
	httpClient    http.Client
	cfgFile       string
	cfgOutputFile string
	watchInterval time.Duration
	retryInterval time.Duration
	watchedDirs   []string
	watcher       *watcher

	// State maintained by apply: the hashes recorded at the last successful
	// reload, and forceReload, which is set when a reload failed so that the
	// next apply triggers a reload even if the hashes did not change.
	lastCfgHash         []byte
	lastWatchedDirsHash []byte
	forceReload         bool

	// Metrics about reload requests (reloads*, lastReload*) and about apply
	// operations started by the Watch loop (configApply*).
	reloads                    prometheus.Counter
	reloadErrors               prometheus.Counter
	lastReloadSuccess          prometheus.Gauge
	lastReloadSuccessTimestamp prometheus.Gauge
	configApplyErrors          prometheus.Counter
	configApply                prometheus.Counter
}
   107  
// Options bundles options for the Reloader.
type Options struct {
	// ReloadURL is the Prometheus URL that receives the POST request
	// triggering a reload.
	ReloadURL *url.URL
	// CfgFile is a path to the Prometheus config file to watch.
	// If empty, no config file is watched.
	CfgFile string
	// CfgOutputFile is a path for the output config file.
	// If CfgOutputFile is not empty the config file will be decompressed if needed, environment variables
	// will be substituted and the output written into the given path. Prometheus should then use
	// CfgOutputFile as its config file path.
	CfgOutputFile string
	// WatchedDirs is a collection of paths for the reloader to watch over.
	WatchedDirs []string
	// DelayInterval controls how long the reloader will wait without receiving
	// new file-system events before it applies the reload.
	DelayInterval time.Duration
	// WatchInterval controls how often the reloader re-reads config and
	// directories even when no file-system event was received. It is also
	// the timeout of a single apply operation. If it is zero, Watch returns
	// right after the initial sync of CfgFile without watching the
	// file-system.
	WatchInterval time.Duration
	// RetryInterval controls how often the reloader retries a reloading of the
	// configuration in case the endpoint returned an error.
	RetryInterval time.Duration
}
   130  
// firstGzipBytes is the byte prefix used to detect a gzip-compressed config
// file: the gzip magic number (0x1f, 0x8b) followed by the compression
// method byte 0x08 (deflate).
var firstGzipBytes = []byte{0x1f, 0x8b, 0x08}
   132  
   133  // New creates a new reloader that watches the given config file and directories
   134  // and triggers a Prometheus reload upon changes.
   135  func New(logger log.Logger, reg prometheus.Registerer, o *Options) *Reloader {
   136  	if logger == nil {
   137  		logger = log.NewNopLogger()
   138  	}
   139  	r := &Reloader{
   140  		logger:        logger,
   141  		reloadURL:     o.ReloadURL,
   142  		cfgFile:       o.CfgFile,
   143  		cfgOutputFile: o.CfgOutputFile,
   144  		watcher:       newWatcher(logger, reg, o.DelayInterval),
   145  		watchedDirs:   o.WatchedDirs,
   146  		watchInterval: o.WatchInterval,
   147  		retryInterval: o.RetryInterval,
   148  
   149  		reloads: promauto.With(reg).NewCounter(
   150  			prometheus.CounterOpts{
   151  				Name: "reloader_reloads_total",
   152  				Help: "Total number of reload requests.",
   153  			},
   154  		),
   155  		reloadErrors: promauto.With(reg).NewCounter(
   156  			prometheus.CounterOpts{
   157  				Name: "reloader_reloads_failed_total",
   158  				Help: "Total number of reload requests that failed.",
   159  			},
   160  		),
   161  		lastReloadSuccess: promauto.With(reg).NewGauge(
   162  			prometheus.GaugeOpts{
   163  				Name: "reloader_last_reload_successful",
   164  				Help: "Whether the last reload attempt was successful",
   165  			},
   166  		),
   167  		lastReloadSuccessTimestamp: promauto.With(reg).NewGauge(
   168  			prometheus.GaugeOpts{
   169  				Name: "reloader_last_reload_success_timestamp_seconds",
   170  				Help: "Timestamp of the last successful reload",
   171  			},
   172  		),
   173  		configApply: promauto.With(reg).NewCounter(
   174  			prometheus.CounterOpts{
   175  				Name: "reloader_config_apply_operations_total",
   176  				Help: "Total number of config apply operations.",
   177  			},
   178  		),
   179  		configApplyErrors: promauto.With(reg).NewCounter(
   180  			prometheus.CounterOpts{
   181  				Name: "reloader_config_apply_operations_failed_total",
   182  				Help: "Total number of config apply operations that failed.",
   183  			},
   184  		),
   185  	}
   186  	return r
   187  }
   188  
   189  // Watch detects any change made to the watched config file and directories. It
   190  // returns when the context is canceled.
   191  // Whenever a filesystem change is detected or the watch interval has elapsed,
   192  // the reloader expands the config file (if cfgOutputFile is specified) and
   193  // triggers a reload if the configuration file or files in the watched
   194  // directories have changed.
   195  // Because some edge cases might be missing, the reloader also relies on the
   196  // watch interval.
   197  func (r *Reloader) Watch(ctx context.Context) error {
   198  	if r.cfgFile == "" && len(r.watchedDirs) == 0 {
   199  		level.Info(r.logger).Log("msg", "nothing to be watched")
   200  		<-ctx.Done()
   201  		return nil
   202  	}
   203  
   204  	defer runutil.CloseWithLogOnErr(r.logger, r.watcher, "config watcher close")
   205  
   206  	if r.cfgFile != "" {
   207  		if err := r.watcher.addFile(r.cfgFile); err != nil {
   208  			return errors.Wrapf(err, "add config file %s to watcher", r.cfgFile)
   209  		}
   210  		initialSyncCtx, initialSyncCancel := context.WithTimeout(ctx, r.watchInterval)
   211  		err := r.apply(initialSyncCtx)
   212  		initialSyncCancel()
   213  		if err != nil {
   214  			return err
   215  		}
   216  	}
   217  
   218  	if r.watchInterval == 0 {
   219  		// Skip watching the file-system.
   220  		return nil
   221  	}
   222  
   223  	for _, dir := range r.watchedDirs {
   224  		if err := r.watcher.addDirectory(dir); err != nil {
   225  			return errors.Wrapf(err, "add directory %s to watcher", dir)
   226  		}
   227  	}
   228  
   229  	// Start watching the file-system.
   230  	var wg sync.WaitGroup
   231  	wg.Add(1)
   232  	go func() {
   233  		r.watcher.run(ctx)
   234  		wg.Done()
   235  	}()
   236  
   237  	level.Info(r.logger).Log(
   238  		"msg", "started watching config file and directories for changes",
   239  		"cfg", r.cfgFile,
   240  		"out", r.cfgOutputFile,
   241  		"dirs", strings.Join(r.watchedDirs, ","))
   242  
   243  	applyCtx, applyCancel := context.WithTimeout(ctx, r.watchInterval)
   244  
   245  	for {
   246  		select {
   247  		case <-applyCtx.Done():
   248  			if ctx.Err() != nil {
   249  				applyCancel()
   250  				wg.Wait()
   251  				return nil
   252  			}
   253  		case <-r.watcher.notify:
   254  		}
   255  
   256  		// Reset the watch timeout.
   257  		applyCancel()
   258  		applyCtx, applyCancel = context.WithTimeout(ctx, r.watchInterval)
   259  
   260  		r.configApply.Inc()
   261  		if err := r.apply(applyCtx); err != nil {
   262  			r.configApplyErrors.Inc()
   263  			level.Error(r.logger).Log("msg", "apply error", "err", err)
   264  			continue
   265  		}
   266  	}
   267  }
   268  
   269  // apply triggers Prometheus reload if rules or config changed. If cfgOutputFile is set, we also
   270  // expand env vars into config file before reloading.
   271  // Reload is retried in retryInterval until watchInterval.
   272  func (r *Reloader) apply(ctx context.Context) error {
   273  	var (
   274  		cfgHash         []byte
   275  		watchedDirsHash []byte
   276  	)
   277  	if r.cfgFile != "" {
   278  		h := sha256.New()
   279  		if err := hashFile(h, r.cfgFile); err != nil {
   280  			return errors.Wrap(err, "hash file")
   281  		}
   282  		cfgHash = h.Sum(nil)
   283  		if r.cfgOutputFile != "" {
   284  			b, err := os.ReadFile(r.cfgFile)
   285  			if err != nil {
   286  				return errors.Wrap(err, "read file")
   287  			}
   288  
   289  			// Detect and extract gzipped file.
   290  			if bytes.Equal(b[0:3], firstGzipBytes) {
   291  				zr, err := gzip.NewReader(bytes.NewReader(b))
   292  				if err != nil {
   293  					return errors.Wrap(err, "create gzip reader")
   294  				}
   295  				defer runutil.CloseWithLogOnErr(r.logger, zr, "gzip reader close")
   296  
   297  				b, err = io.ReadAll(zr)
   298  				if err != nil {
   299  					return errors.Wrap(err, "read compressed config file")
   300  				}
   301  			}
   302  
   303  			b, err = expandEnv(b)
   304  			if err != nil {
   305  				return errors.Wrap(err, "expand environment variables")
   306  			}
   307  
   308  			tmpFile := r.cfgOutputFile + ".tmp"
   309  			defer func() {
   310  				_ = os.Remove(tmpFile)
   311  			}()
   312  			if err := os.WriteFile(tmpFile, b, 0644); err != nil {
   313  				return errors.Wrap(err, "write file")
   314  			}
   315  			if err := os.Rename(tmpFile, r.cfgOutputFile); err != nil {
   316  				return errors.Wrap(err, "rename file")
   317  			}
   318  		}
   319  	}
   320  
   321  	h := sha256.New()
   322  	for _, dir := range r.watchedDirs {
   323  		walkDir, err := filepath.EvalSymlinks(dir)
   324  		if err != nil {
   325  			return errors.Wrap(err, "dir symlink eval")
   326  		}
   327  		err = filepath.Walk(walkDir, func(path string, f os.FileInfo, err error) error {
   328  			if err != nil {
   329  				return err
   330  			}
   331  
   332  			// filepath.Walk uses Lstat to retrieve os.FileInfo. Lstat does not
   333  			// follow symlinks. Make sure to follow a symlink before checking
   334  			// if it is a directory.
   335  			targetFile, err := os.Stat(path)
   336  			if err != nil {
   337  				return err
   338  			}
   339  
   340  			if targetFile.IsDir() {
   341  				return nil
   342  			}
   343  
   344  			if err := hashFile(h, path); err != nil {
   345  				return err
   346  			}
   347  			return nil
   348  		})
   349  		if err != nil {
   350  			return errors.Wrap(err, "build hash")
   351  		}
   352  	}
   353  	if len(r.watchedDirs) > 0 {
   354  		watchedDirsHash = h.Sum(nil)
   355  	}
   356  
   357  	if !r.forceReload && bytes.Equal(r.lastCfgHash, cfgHash) && bytes.Equal(r.lastWatchedDirsHash, watchedDirsHash) {
   358  		// Nothing to do.
   359  		return nil
   360  	}
   361  
   362  	if err := runutil.RetryWithLog(r.logger, r.retryInterval, ctx.Done(), func() error {
   363  		if r.watchInterval == 0 {
   364  			return nil
   365  		}
   366  		r.reloads.Inc()
   367  		if err := r.triggerReload(ctx); err != nil {
   368  			r.reloadErrors.Inc()
   369  			r.lastReloadSuccess.Set(0)
   370  			return errors.Wrap(err, "trigger reload")
   371  		}
   372  
   373  		r.forceReload = false
   374  		r.lastCfgHash = cfgHash
   375  		r.lastWatchedDirsHash = watchedDirsHash
   376  		level.Info(r.logger).Log(
   377  			"msg", "Reload triggered",
   378  			"cfg_in", r.cfgFile,
   379  			"cfg_out", r.cfgOutputFile,
   380  			"watched_dirs", strings.Join(r.watchedDirs, ", "))
   381  		r.lastReloadSuccess.Set(1)
   382  		r.lastReloadSuccessTimestamp.SetToCurrentTime()
   383  		return nil
   384  	}); err != nil {
   385  		r.forceReload = true
   386  		level.Error(r.logger).Log("msg", "Failed to trigger reload. Retrying.", "err", err)
   387  	}
   388  
   389  	return nil
   390  }
   391  
   392  func hashFile(h hash.Hash, fn string) error {
   393  	f, err := os.Open(filepath.Clean(fn))
   394  	if err != nil {
   395  		return err
   396  	}
   397  	defer runutil.CloseWithErrCapture(&err, f, "close file")
   398  
   399  	if _, err := h.Write([]byte{'\xff'}); err != nil {
   400  		return err
   401  	}
   402  	if _, err := h.Write([]byte(fn)); err != nil {
   403  		return err
   404  	}
   405  	if _, err := h.Write([]byte{'\xff'}); err != nil {
   406  		return err
   407  	}
   408  
   409  	if _, err := io.Copy(h, f); err != nil {
   410  		return err
   411  	}
   412  	return nil
   413  }
   414  
   415  func (r *Reloader) triggerReload(ctx context.Context) error {
   416  	req, err := http.NewRequest("POST", r.reloadURL.String(), nil)
   417  	if err != nil {
   418  		return errors.Wrap(err, "create request")
   419  	}
   420  	req = req.WithContext(ctx)
   421  
   422  	resp, err := r.httpClient.Do(req)
   423  	if err != nil {
   424  		return errors.Wrap(err, "reload request failed")
   425  	}
   426  	defer runutil.ExhaustCloseWithLogOnErr(r.logger, resp.Body, "trigger reload resp body")
   427  
   428  	if resp.StatusCode != 200 {
   429  		return errors.Errorf("received non-200 response: %s; have you set `--web.enable-lifecycle` Prometheus flag?", resp.Status)
   430  	}
   431  	return nil
   432  }
   433  
// SetHttpClient replaces the HTTP client the reloader uses for its reload
// requests. The client is taken by value, so the reloader keeps its own copy.
func (r *Reloader) SetHttpClient(client http.Client) {
	r.httpClient = client
}
   438  
   439  // ReloadURLFromBase returns the standard Prometheus reload URL from its base URL.
   440  func ReloadURLFromBase(u *url.URL) *url.URL {
   441  	r := *u
   442  	r.Path = path.Join(r.Path, "/-/reload")
   443  	return &r
   444  }
   445  
   446  var envRe = regexp.MustCompile(`\$\(([a-zA-Z_0-9]+)\)`)
   447  
   448  func expandEnv(b []byte) (r []byte, err error) {
   449  	r = envRe.ReplaceAllFunc(b, func(n []byte) []byte {
   450  		if err != nil {
   451  			return nil
   452  		}
   453  		n = n[2 : len(n)-1]
   454  
   455  		v, ok := os.LookupEnv(string(n))
   456  		if !ok {
   457  			err = errors.Errorf("found reference to unset environment variable %q", n)
   458  			return nil
   459  		}
   460  		return []byte(v)
   461  	})
   462  	return r, err
   463  }
   464  
// watcher wraps an fsnotify.Watcher. Its run method turns file-system events
// for the watched paths into signals on the notify channel, delayed by
// delayInterval.
type watcher struct {
	// notify receives a signal from run when a relevant file-system event was
	// seen and no further event arrived during delayInterval.
	notify chan struct{}

	// w is created on the first addPath call and reset to nil by Close.
	// watchedDirs holds every path passed to addPath plus the parent
	// directory of every watched file; run uses it to filter events.
	w             *fsnotify.Watcher
	watchedDirs   map[string]struct{}
	delayInterval time.Duration

	// Logger and metrics.
	logger       log.Logger
	watchedItems prometheus.Gauge
	watchEvents  prometheus.Counter
	watchErrors  prometheus.Counter
}
   477  
   478  func newWatcher(logger log.Logger, reg prometheus.Registerer, delayInterval time.Duration) *watcher {
   479  	return &watcher{
   480  		logger:        logger,
   481  		delayInterval: delayInterval,
   482  		notify:        make(chan struct{}),
   483  		watchedDirs:   make(map[string]struct{}),
   484  
   485  		watchedItems: promauto.With(reg).NewGauge(
   486  			prometheus.GaugeOpts{
   487  				Name: "reloader_watches",
   488  				Help: "Number of resources watched by the reloader.",
   489  			},
   490  		),
   491  		watchEvents: promauto.With(reg).NewCounter(
   492  			prometheus.CounterOpts{
   493  				Name: "reloader_watch_events_total",
   494  				Help: "Total number of events received by the reloader from the watcher.",
   495  			},
   496  		),
   497  		watchErrors: promauto.With(reg).NewCounter(
   498  			prometheus.CounterOpts{
   499  				Name: "reloader_watch_errors_total",
   500  				Help: "Total number of errors received by the reloader from the watcher.",
   501  			},
   502  		),
   503  	}
   504  }
   505  
   506  // Close implements the io.Closer interface.
   507  func (w *watcher) Close() error {
   508  	if w.w == nil {
   509  		return nil
   510  	}
   511  	watcher := w.w
   512  	w.w = nil
   513  	return watcher.Close()
   514  }
   515  
   516  func (w *watcher) addPath(name string) error {
   517  	if w.w == nil {
   518  		fsWatcher, err := fsnotify.NewWatcher()
   519  		if err != nil {
   520  			return errors.Wrap(err, "create watcher")
   521  		}
   522  		w.w = fsWatcher
   523  	}
   524  
   525  	if err := w.w.Add(name); err != nil {
   526  		return err
   527  	}
   528  
   529  	w.watchedDirs[name] = struct{}{}
   530  	w.watchedItems.Set(float64(len(w.watchedDirs)))
   531  
   532  	return nil
   533  }
   534  
// addDirectory records the directory name in watchedDirs and starts watching
// it. The entry is recorded before addPath runs, so it stays in watchedDirs
// even if addPath returns an error.
func (w *watcher) addDirectory(name string) error {
	w.watchedDirs[name] = struct{}{}
	return w.addPath(name)
}
   539  
// addFile starts watching the file name. The file's parent directory is
// recorded in watchedDirs first, because run only forwards events whose parent
// directory is in that set.
func (w *watcher) addFile(name string) error {
	w.watchedDirs[filepath.Dir(name)] = struct{}{}
	return w.addPath(name)
}
   544  
   545  func (w *watcher) run(ctx context.Context) {
   546  	defer runutil.CloseWithLogOnErr(w.logger, w.w, "config watcher close")
   547  
   548  	var (
   549  		wg     sync.WaitGroup
   550  		notify = make(chan struct{})
   551  	)
   552  
   553  	wg.Add(1)
   554  	go func() {
   555  		defer wg.Done()
   556  
   557  		var (
   558  			delayCtx context.Context
   559  			cancel   context.CancelFunc
   560  		)
   561  
   562  		for {
   563  			select {
   564  			case <-ctx.Done():
   565  				if cancel != nil {
   566  					cancel()
   567  				}
   568  				return
   569  
   570  			case <-notify:
   571  				if cancel != nil {
   572  					cancel()
   573  				}
   574  
   575  				delayCtx, cancel = context.WithCancel(ctx)
   576  
   577  				wg.Add(1)
   578  				go func(ctx context.Context) {
   579  					defer wg.Done()
   580  
   581  					if w.delayInterval > 0 {
   582  						t := time.NewTicker(w.delayInterval)
   583  						defer t.Stop()
   584  
   585  						select {
   586  						case <-ctx.Done():
   587  							return
   588  						case <-t.C:
   589  						}
   590  					}
   591  
   592  					select {
   593  					case w.notify <- struct{}{}:
   594  					case <-ctx.Done():
   595  					}
   596  				}(delayCtx)
   597  			}
   598  		}
   599  	}()
   600  
   601  	for {
   602  		select {
   603  		case <-ctx.Done():
   604  			wg.Wait()
   605  			return
   606  
   607  		case event := <-w.w.Events:
   608  			w.watchEvents.Inc()
   609  			if _, ok := w.watchedDirs[filepath.Dir(event.Name)]; ok {
   610  				select {
   611  				case notify <- struct{}{}:
   612  				default:
   613  				}
   614  			}
   615  
   616  		case err := <-w.w.Errors:
   617  			w.watchErrors.Inc()
   618  			level.Error(w.logger).Log("msg", "watch error", "err", err)
   619  		}
   620  	}
   621  }