// github.com/thanos-io/thanos@v0.32.5/pkg/reloader/reloader.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

// Package reloader contains helpers to trigger reloads of Prometheus instances
// on configuration changes and to substitute environment variables in config files.
//
// The Reloader type is useful when you want to:
//
//   - Watch for changes to a certain file (`cfgFile`).
//   - Optionally, specify a different output file for the watched `cfgFile` (`cfgOutputFile`).
//     This will also try to decompress the `cfgFile` if needed and substitute ALL the
//     environment variables using the Kubernetes substitution format (`$(var)`).
//   - Watch for changes to certain directories (`watchedDirs`).
//
// Once either the file or any of the directories changes, Prometheus on the given
// `reloadURL` will be notified, causing Prometheus to reload its configuration and rules.
//
// Create a reloader as follows:
//
//	u, _ := url.Parse("http://localhost:9090")
//	rl := reloader.New(nil, nil, &reloader.Options{
//		ReloadURL:     reloader.ReloadURLFromBase(u),
//		CfgFile:       "/path/to/cfg",
//		CfgOutputFile: "/path/to/cfg.out",
//		WatchedDirs:   []string{"/path/to/dirs"},
//		WatchInterval: 3 * time.Minute,
//		RetryInterval: 5 * time.Second,
//	})
//
// The reload URL can be generated with the function ReloadURLFromBase().
// It appends the default reload path to the given URL:
//
//	u, _ := url.Parse("http://localhost:9090")
//	reloader.ReloadURLFromBase(u) // It will return "http://localhost:9090/-/reload"
//
// Start watching for changes; Watch keeps running until the context gets canceled:
//
//	ctx, cancel := context.WithCancel(context.Background())
//	go func() {
//		if err := rl.Watch(ctx); err != nil {
//			log.Fatal(err)
//		}
//	}()
//	// ...
43 // cancel() 44 // 45 // Reloader will make a schedule to check the given config files and dirs of sum of hash with the last result, 46 // even if it is no changes. 47 // 48 // A basic example of configuration template with environment variables: 49 // 50 // global: 51 // external_labels: 52 // replica: '$(HOSTNAME)' 53 package reloader 54 55 import ( 56 "bytes" 57 "compress/gzip" 58 "context" 59 "hash" 60 "io" 61 "net/http" 62 "net/url" 63 "os" 64 "path" 65 "path/filepath" 66 "regexp" 67 "strings" 68 "sync" 69 "time" 70 71 "github.com/fsnotify/fsnotify" 72 "github.com/go-kit/log" 73 "github.com/go-kit/log/level" 74 "github.com/minio/sha256-simd" 75 "github.com/pkg/errors" 76 "github.com/prometheus/client_golang/prometheus" 77 "github.com/prometheus/client_golang/prometheus/promauto" 78 79 "github.com/thanos-io/thanos/pkg/runutil" 80 ) 81 82 // Reloader can watch config files and trigger reloads of a Prometheus server. 83 // It optionally substitutes environment variables in the configuration. 84 // Referenced environment variables must be of the form `$(var)` (not `$var` or `${var}`). 85 type Reloader struct { 86 logger log.Logger 87 reloadURL *url.URL 88 httpClient http.Client 89 cfgFile string 90 cfgOutputFile string 91 watchInterval time.Duration 92 retryInterval time.Duration 93 watchedDirs []string 94 watcher *watcher 95 96 lastCfgHash []byte 97 lastWatchedDirsHash []byte 98 forceReload bool 99 100 reloads prometheus.Counter 101 reloadErrors prometheus.Counter 102 lastReloadSuccess prometheus.Gauge 103 lastReloadSuccessTimestamp prometheus.Gauge 104 configApplyErrors prometheus.Counter 105 configApply prometheus.Counter 106 } 107 108 // Options bundles options for the Reloader. 109 type Options struct { 110 // ReloadURL is a prometheus URL to trigger reloads. 111 ReloadURL *url.URL 112 // CfgFile is a path to the prometheus config file to watch. 113 CfgFile string 114 // CfgOutputFile is a path for the output config file. 
115 // If cfgOutputFile is not empty the config file will be decompressed if needed, environment variables 116 // will be substituted and the output written into the given path. Prometheus should then use 117 // cfgOutputFile as its config file path. 118 CfgOutputFile string 119 // WatchedDirs is a collection of paths for the reloader to watch over. 120 WatchedDirs []string 121 // DelayInterval controls how long the reloader will wait without receiving 122 // new file-system events before it applies the reload. 123 DelayInterval time.Duration 124 // WatchInterval controls how often reloader re-reads config and directories. 125 WatchInterval time.Duration 126 // RetryInterval controls how often the reloader retries a reloading of the 127 // configuration in case the endpoint returned an error. 128 RetryInterval time.Duration 129 } 130 131 var firstGzipBytes = []byte{0x1f, 0x8b, 0x08} 132 133 // New creates a new reloader that watches the given config file and directories 134 // and triggers a Prometheus reload upon changes. 
135 func New(logger log.Logger, reg prometheus.Registerer, o *Options) *Reloader { 136 if logger == nil { 137 logger = log.NewNopLogger() 138 } 139 r := &Reloader{ 140 logger: logger, 141 reloadURL: o.ReloadURL, 142 cfgFile: o.CfgFile, 143 cfgOutputFile: o.CfgOutputFile, 144 watcher: newWatcher(logger, reg, o.DelayInterval), 145 watchedDirs: o.WatchedDirs, 146 watchInterval: o.WatchInterval, 147 retryInterval: o.RetryInterval, 148 149 reloads: promauto.With(reg).NewCounter( 150 prometheus.CounterOpts{ 151 Name: "reloader_reloads_total", 152 Help: "Total number of reload requests.", 153 }, 154 ), 155 reloadErrors: promauto.With(reg).NewCounter( 156 prometheus.CounterOpts{ 157 Name: "reloader_reloads_failed_total", 158 Help: "Total number of reload requests that failed.", 159 }, 160 ), 161 lastReloadSuccess: promauto.With(reg).NewGauge( 162 prometheus.GaugeOpts{ 163 Name: "reloader_last_reload_successful", 164 Help: "Whether the last reload attempt was successful", 165 }, 166 ), 167 lastReloadSuccessTimestamp: promauto.With(reg).NewGauge( 168 prometheus.GaugeOpts{ 169 Name: "reloader_last_reload_success_timestamp_seconds", 170 Help: "Timestamp of the last successful reload", 171 }, 172 ), 173 configApply: promauto.With(reg).NewCounter( 174 prometheus.CounterOpts{ 175 Name: "reloader_config_apply_operations_total", 176 Help: "Total number of config apply operations.", 177 }, 178 ), 179 configApplyErrors: promauto.With(reg).NewCounter( 180 prometheus.CounterOpts{ 181 Name: "reloader_config_apply_operations_failed_total", 182 Help: "Total number of config apply operations that failed.", 183 }, 184 ), 185 } 186 return r 187 } 188 189 // Watch detects any change made to the watched config file and directories. It 190 // returns when the context is canceled. 
191 // Whenever a filesystem change is detected or the watch interval has elapsed, 192 // the reloader expands the config file (if cfgOutputFile is specified) and 193 // triggers a reload if the configuration file or files in the watched 194 // directories have changed. 195 // Because some edge cases might be missing, the reloader also relies on the 196 // watch interval. 197 func (r *Reloader) Watch(ctx context.Context) error { 198 if r.cfgFile == "" && len(r.watchedDirs) == 0 { 199 level.Info(r.logger).Log("msg", "nothing to be watched") 200 <-ctx.Done() 201 return nil 202 } 203 204 defer runutil.CloseWithLogOnErr(r.logger, r.watcher, "config watcher close") 205 206 if r.cfgFile != "" { 207 if err := r.watcher.addFile(r.cfgFile); err != nil { 208 return errors.Wrapf(err, "add config file %s to watcher", r.cfgFile) 209 } 210 initialSyncCtx, initialSyncCancel := context.WithTimeout(ctx, r.watchInterval) 211 err := r.apply(initialSyncCtx) 212 initialSyncCancel() 213 if err != nil { 214 return err 215 } 216 } 217 218 if r.watchInterval == 0 { 219 // Skip watching the file-system. 220 return nil 221 } 222 223 for _, dir := range r.watchedDirs { 224 if err := r.watcher.addDirectory(dir); err != nil { 225 return errors.Wrapf(err, "add directory %s to watcher", dir) 226 } 227 } 228 229 // Start watching the file-system. 230 var wg sync.WaitGroup 231 wg.Add(1) 232 go func() { 233 r.watcher.run(ctx) 234 wg.Done() 235 }() 236 237 level.Info(r.logger).Log( 238 "msg", "started watching config file and directories for changes", 239 "cfg", r.cfgFile, 240 "out", r.cfgOutputFile, 241 "dirs", strings.Join(r.watchedDirs, ",")) 242 243 applyCtx, applyCancel := context.WithTimeout(ctx, r.watchInterval) 244 245 for { 246 select { 247 case <-applyCtx.Done(): 248 if ctx.Err() != nil { 249 applyCancel() 250 wg.Wait() 251 return nil 252 } 253 case <-r.watcher.notify: 254 } 255 256 // Reset the watch timeout. 
257 applyCancel() 258 applyCtx, applyCancel = context.WithTimeout(ctx, r.watchInterval) 259 260 r.configApply.Inc() 261 if err := r.apply(applyCtx); err != nil { 262 r.configApplyErrors.Inc() 263 level.Error(r.logger).Log("msg", "apply error", "err", err) 264 continue 265 } 266 } 267 } 268 269 // apply triggers Prometheus reload if rules or config changed. If cfgOutputFile is set, we also 270 // expand env vars into config file before reloading. 271 // Reload is retried in retryInterval until watchInterval. 272 func (r *Reloader) apply(ctx context.Context) error { 273 var ( 274 cfgHash []byte 275 watchedDirsHash []byte 276 ) 277 if r.cfgFile != "" { 278 h := sha256.New() 279 if err := hashFile(h, r.cfgFile); err != nil { 280 return errors.Wrap(err, "hash file") 281 } 282 cfgHash = h.Sum(nil) 283 if r.cfgOutputFile != "" { 284 b, err := os.ReadFile(r.cfgFile) 285 if err != nil { 286 return errors.Wrap(err, "read file") 287 } 288 289 // Detect and extract gzipped file. 290 if bytes.Equal(b[0:3], firstGzipBytes) { 291 zr, err := gzip.NewReader(bytes.NewReader(b)) 292 if err != nil { 293 return errors.Wrap(err, "create gzip reader") 294 } 295 defer runutil.CloseWithLogOnErr(r.logger, zr, "gzip reader close") 296 297 b, err = io.ReadAll(zr) 298 if err != nil { 299 return errors.Wrap(err, "read compressed config file") 300 } 301 } 302 303 b, err = expandEnv(b) 304 if err != nil { 305 return errors.Wrap(err, "expand environment variables") 306 } 307 308 tmpFile := r.cfgOutputFile + ".tmp" 309 defer func() { 310 _ = os.Remove(tmpFile) 311 }() 312 if err := os.WriteFile(tmpFile, b, 0644); err != nil { 313 return errors.Wrap(err, "write file") 314 } 315 if err := os.Rename(tmpFile, r.cfgOutputFile); err != nil { 316 return errors.Wrap(err, "rename file") 317 } 318 } 319 } 320 321 h := sha256.New() 322 for _, dir := range r.watchedDirs { 323 walkDir, err := filepath.EvalSymlinks(dir) 324 if err != nil { 325 return errors.Wrap(err, "dir symlink eval") 326 } 327 err = 
filepath.Walk(walkDir, func(path string, f os.FileInfo, err error) error { 328 if err != nil { 329 return err 330 } 331 332 // filepath.Walk uses Lstat to retrieve os.FileInfo. Lstat does not 333 // follow symlinks. Make sure to follow a symlink before checking 334 // if it is a directory. 335 targetFile, err := os.Stat(path) 336 if err != nil { 337 return err 338 } 339 340 if targetFile.IsDir() { 341 return nil 342 } 343 344 if err := hashFile(h, path); err != nil { 345 return err 346 } 347 return nil 348 }) 349 if err != nil { 350 return errors.Wrap(err, "build hash") 351 } 352 } 353 if len(r.watchedDirs) > 0 { 354 watchedDirsHash = h.Sum(nil) 355 } 356 357 if !r.forceReload && bytes.Equal(r.lastCfgHash, cfgHash) && bytes.Equal(r.lastWatchedDirsHash, watchedDirsHash) { 358 // Nothing to do. 359 return nil 360 } 361 362 if err := runutil.RetryWithLog(r.logger, r.retryInterval, ctx.Done(), func() error { 363 if r.watchInterval == 0 { 364 return nil 365 } 366 r.reloads.Inc() 367 if err := r.triggerReload(ctx); err != nil { 368 r.reloadErrors.Inc() 369 r.lastReloadSuccess.Set(0) 370 return errors.Wrap(err, "trigger reload") 371 } 372 373 r.forceReload = false 374 r.lastCfgHash = cfgHash 375 r.lastWatchedDirsHash = watchedDirsHash 376 level.Info(r.logger).Log( 377 "msg", "Reload triggered", 378 "cfg_in", r.cfgFile, 379 "cfg_out", r.cfgOutputFile, 380 "watched_dirs", strings.Join(r.watchedDirs, ", ")) 381 r.lastReloadSuccess.Set(1) 382 r.lastReloadSuccessTimestamp.SetToCurrentTime() 383 return nil 384 }); err != nil { 385 r.forceReload = true 386 level.Error(r.logger).Log("msg", "Failed to trigger reload. 
Retrying.", "err", err) 387 } 388 389 return nil 390 } 391 392 func hashFile(h hash.Hash, fn string) error { 393 f, err := os.Open(filepath.Clean(fn)) 394 if err != nil { 395 return err 396 } 397 defer runutil.CloseWithErrCapture(&err, f, "close file") 398 399 if _, err := h.Write([]byte{'\xff'}); err != nil { 400 return err 401 } 402 if _, err := h.Write([]byte(fn)); err != nil { 403 return err 404 } 405 if _, err := h.Write([]byte{'\xff'}); err != nil { 406 return err 407 } 408 409 if _, err := io.Copy(h, f); err != nil { 410 return err 411 } 412 return nil 413 } 414 415 func (r *Reloader) triggerReload(ctx context.Context) error { 416 req, err := http.NewRequest("POST", r.reloadURL.String(), nil) 417 if err != nil { 418 return errors.Wrap(err, "create request") 419 } 420 req = req.WithContext(ctx) 421 422 resp, err := r.httpClient.Do(req) 423 if err != nil { 424 return errors.Wrap(err, "reload request failed") 425 } 426 defer runutil.ExhaustCloseWithLogOnErr(r.logger, resp.Body, "trigger reload resp body") 427 428 if resp.StatusCode != 200 { 429 return errors.Errorf("received non-200 response: %s; have you set `--web.enable-lifecycle` Prometheus flag?", resp.Status) 430 } 431 return nil 432 } 433 434 // SetHttpClient sets Http client for reloader. 435 func (r *Reloader) SetHttpClient(client http.Client) { 436 r.httpClient = client 437 } 438 439 // ReloadURLFromBase returns the standard Prometheus reload URL from its base URL. 
440 func ReloadURLFromBase(u *url.URL) *url.URL { 441 r := *u 442 r.Path = path.Join(r.Path, "/-/reload") 443 return &r 444 } 445 446 var envRe = regexp.MustCompile(`\$\(([a-zA-Z_0-9]+)\)`) 447 448 func expandEnv(b []byte) (r []byte, err error) { 449 r = envRe.ReplaceAllFunc(b, func(n []byte) []byte { 450 if err != nil { 451 return nil 452 } 453 n = n[2 : len(n)-1] 454 455 v, ok := os.LookupEnv(string(n)) 456 if !ok { 457 err = errors.Errorf("found reference to unset environment variable %q", n) 458 return nil 459 } 460 return []byte(v) 461 }) 462 return r, err 463 } 464 465 type watcher struct { 466 notify chan struct{} 467 468 w *fsnotify.Watcher 469 watchedDirs map[string]struct{} 470 delayInterval time.Duration 471 472 logger log.Logger 473 watchedItems prometheus.Gauge 474 watchEvents prometheus.Counter 475 watchErrors prometheus.Counter 476 } 477 478 func newWatcher(logger log.Logger, reg prometheus.Registerer, delayInterval time.Duration) *watcher { 479 return &watcher{ 480 logger: logger, 481 delayInterval: delayInterval, 482 notify: make(chan struct{}), 483 watchedDirs: make(map[string]struct{}), 484 485 watchedItems: promauto.With(reg).NewGauge( 486 prometheus.GaugeOpts{ 487 Name: "reloader_watches", 488 Help: "Number of resources watched by the reloader.", 489 }, 490 ), 491 watchEvents: promauto.With(reg).NewCounter( 492 prometheus.CounterOpts{ 493 Name: "reloader_watch_events_total", 494 Help: "Total number of events received by the reloader from the watcher.", 495 }, 496 ), 497 watchErrors: promauto.With(reg).NewCounter( 498 prometheus.CounterOpts{ 499 Name: "reloader_watch_errors_total", 500 Help: "Total number of errors received by the reloader from the watcher.", 501 }, 502 ), 503 } 504 } 505 506 // Close implements the io.Closer interface. 
507 func (w *watcher) Close() error { 508 if w.w == nil { 509 return nil 510 } 511 watcher := w.w 512 w.w = nil 513 return watcher.Close() 514 } 515 516 func (w *watcher) addPath(name string) error { 517 if w.w == nil { 518 fsWatcher, err := fsnotify.NewWatcher() 519 if err != nil { 520 return errors.Wrap(err, "create watcher") 521 } 522 w.w = fsWatcher 523 } 524 525 if err := w.w.Add(name); err != nil { 526 return err 527 } 528 529 w.watchedDirs[name] = struct{}{} 530 w.watchedItems.Set(float64(len(w.watchedDirs))) 531 532 return nil 533 } 534 535 func (w *watcher) addDirectory(name string) error { 536 w.watchedDirs[name] = struct{}{} 537 return w.addPath(name) 538 } 539 540 func (w *watcher) addFile(name string) error { 541 w.watchedDirs[filepath.Dir(name)] = struct{}{} 542 return w.addPath(name) 543 } 544 545 func (w *watcher) run(ctx context.Context) { 546 defer runutil.CloseWithLogOnErr(w.logger, w.w, "config watcher close") 547 548 var ( 549 wg sync.WaitGroup 550 notify = make(chan struct{}) 551 ) 552 553 wg.Add(1) 554 go func() { 555 defer wg.Done() 556 557 var ( 558 delayCtx context.Context 559 cancel context.CancelFunc 560 ) 561 562 for { 563 select { 564 case <-ctx.Done(): 565 if cancel != nil { 566 cancel() 567 } 568 return 569 570 case <-notify: 571 if cancel != nil { 572 cancel() 573 } 574 575 delayCtx, cancel = context.WithCancel(ctx) 576 577 wg.Add(1) 578 go func(ctx context.Context) { 579 defer wg.Done() 580 581 if w.delayInterval > 0 { 582 t := time.NewTicker(w.delayInterval) 583 defer t.Stop() 584 585 select { 586 case <-ctx.Done(): 587 return 588 case <-t.C: 589 } 590 } 591 592 select { 593 case w.notify <- struct{}{}: 594 case <-ctx.Done(): 595 } 596 }(delayCtx) 597 } 598 } 599 }() 600 601 for { 602 select { 603 case <-ctx.Done(): 604 wg.Wait() 605 return 606 607 case event := <-w.w.Events: 608 w.watchEvents.Inc() 609 if _, ok := w.watchedDirs[filepath.Dir(event.Name)]; ok { 610 select { 611 case notify <- struct{}{}: 612 default: 613 } 614 } 
615 616 case err := <-w.w.Errors: 617 w.watchErrors.Inc() 618 level.Error(w.logger).Log("msg", "watch error", "err", err) 619 } 620 } 621 }