github.com/pelicanplatform/pelican@v1.0.5/web_ui/prometheus.go (about) 1 // Copyright 2015 The Prometheus Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 // This package started as a fork of the `prometheus` CLI executable and was 15 // heavily adapted to make it embedded into the pelican web UI. 16 package web_ui 17 18 import ( 19 "context" 20 "fmt" 21 "math" 22 "net/http" 23 "net/url" 24 "os" 25 "os/signal" 26 "path/filepath" 27 "strings" 28 "sync" 29 "syscall" 30 "time" 31 32 "github.com/alecthomas/units" 33 "github.com/gin-gonic/gin" 34 "github.com/go-kit/log" 35 "github.com/go-kit/log/level" 36 "github.com/grafana/regexp" 37 "github.com/mwitkow/go-conntrack" 38 "github.com/oklog/run" 39 "github.com/pelicanplatform/pelican/director" 40 "github.com/pelicanplatform/pelican/param" 41 "github.com/pelicanplatform/pelican/utils" 42 "github.com/pkg/errors" 43 "github.com/prometheus/client_golang/prometheus" 44 "github.com/prometheus/common/model" 45 "github.com/prometheus/common/version" 46 "github.com/sirupsen/logrus" 47 "go.uber.org/atomic" 48 49 common_config "github.com/prometheus/common/config" 50 "github.com/prometheus/common/route" 51 "github.com/prometheus/prometheus/config" 52 "github.com/prometheus/prometheus/discovery" 53 prom_http "github.com/prometheus/prometheus/discovery/http" 54 "github.com/prometheus/prometheus/discovery/targetgroup" 55 "github.com/prometheus/prometheus/model/exemplar" 56 
	"github.com/prometheus/prometheus/model/histogram"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/model/metadata"
	"github.com/prometheus/prometheus/model/relabel"
	"github.com/prometheus/prometheus/promql"
	"github.com/prometheus/prometheus/scrape"
	"github.com/prometheus/prometheus/storage"

	//"github.com/prometheus/prometheus/storage/remote"
	"github.com/prometheus/prometheus/tsdb"
	"github.com/prometheus/prometheus/tsdb/agent"
	"github.com/prometheus/prometheus/tsdb/wlog"
	"github.com/prometheus/prometheus/web"
	api_v1 "github.com/prometheus/prometheus/web/api/v1"
)

var (
	appName = "prometheus"

	// Default TSDB sample retention; parsed into defaultRetentionDuration in init().
	defaultRetentionString   = "15d"
	defaultRetentionDuration model.Duration

	// globalConfig holds the currently-applied Prometheus config; guarded by
	// globalConfigMtx (written by reloadConfig, read by the v1 API handler).
	globalConfig    config.Config
	globalConfigMtx sync.RWMutex
)

func init() {
	prometheus.MustRegister(version.NewCollector(strings.ReplaceAll(appName, "-", "_")))

	var err error
	defaultRetentionDuration, err = model.ParseDuration(defaultRetentionString)
	if err != nil {
		panic(err)
	}
}

// flagConfig mirrors the subset of Prometheus CLI flags this embedded
// instance needs; values are hard-coded in ConfigureEmbeddedPrometheus
// instead of being parsed from a command line.
type flagConfig struct {
	serverStoragePath   string
	forGracePeriod      model.Duration
	outageTolerance     model.Duration
	resendDelay         model.Duration
	scrape              scrape.Options
	tsdb                tsdbOptions
	lookbackDelta       model.Duration
	webTimeout          model.Duration
	queryTimeout        model.Duration
	queryConcurrency    int
	queryMaxSamples     int
	RemoteFlushDeadline model.Duration

	enableExpandExternalLabels bool
	enablePerStepStats         bool
}

// ReadyHandler gates HTTP handlers on server readiness via an atomic flag.
type ReadyHandler struct {
	ready atomic.Uint32
}

// LogrusAdapter bridges go-kit logging (used by Prometheus components) onto a
// logrus logger, carrying a set of default fields into every entry.
type LogrusAdapter struct {
	*logrus.Logger
	defaultFields logrus.Fields
}

// SetReady flips the readiness flag (1 = ready, 0 = not ready).
func (h *ReadyHandler) SetReady(v bool) {
	if v {
		h.ready.Store(1)
		return
	}

	h.ready.Store(0)
}

func (h *ReadyHandler) isReady() bool {
	return h.ready.Load() > 0
}

// testReady wraps f so that requests receive 503 until SetReady(true) is called.
func (h *ReadyHandler) testReady(f http.HandlerFunc) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		if h.isReady() {
			f(w, r)
		} else {
			w.WriteHeader(http.StatusServiceUnavailable)
			fmt.Fprintf(w, "Service Unavailable")
		}
	}
}

// runtimeInfo satisfies the api_v1 runtime-info callback with an empty value;
// the embedded instance does not expose Prometheus runtime details.
func runtimeInfo() (api_v1.RuntimeInfo, error) {
	return api_v1.RuntimeInfo{}, nil
}

// Configure director's Prometheus scraper to use HTTP service discovery for origins.
// Returns a fresh ScrapeConfig carrying newly-minted service-discovery and
// scrape tokens; called both at startup and from the periodic token refresh.
func configDirectorPromScraper() (*config.ScrapeConfig, error) {
	originDiscoveryUrl, err := url.Parse(param.Server_ExternalWebUrl.GetString())
	if err != nil {
		return nil, fmt.Errorf("parse external URL %v: %w", param.Server_ExternalWebUrl.GetString(), err)
	}
	sdToken, err := director.CreateDirectorSDToken()
	if err != nil {
		return nil, fmt.Errorf("Failed to generate token for Prometheus service discovery at start: %v", err)
	}
	scraperToken, err := director.CreateDirectorScrapeToken()
	if err != nil {
		return nil, fmt.Errorf("Failed to generate token for director scraper at start: %v", err)
	}
	originDiscoveryUrl.Path = "/api/v1.0/director/discoverOrigins"
	scrapeConfig := config.DefaultScrapeConfig
	scrapeConfig.JobName = "origins"
	scrapeConfig.Scheme = "https"

	// This will cause the director to maintain a CA bundle, including the custom CA, at
	// the given location.  Makes up for the fact we can't provide Prometheus with a transport
	caBundle := filepath.Join(param.Monitoring_DataLocation.GetString(), "ca-bundle.crt")
	caCount, err := utils.PeriodicWriteCABundle(caBundle, 2*time.Minute)
	if err != nil {
		return nil, errors.Wrap(err, "Unable to generate CA bundle for prometheus")
	}

	scraperHttpClientConfig := common_config.HTTPClientConfig{
		TLSConfig: common_config.TLSConfig{
			// For the scraper to origins' metrics, we get TLSSkipVerify from config
			// As this request is to external address
			InsecureSkipVerify: param.TLSSkipVerify.GetBool(),
		},
		// We add token auth for scraping all origin/cache servers
		Authorization: &common_config.Authorization{
			Type:        "Bearer",
			Credentials: common_config.Secret(scraperToken),
		},
	}
	// Only point Prometheus at the CA bundle if at least one CA was written.
	if caCount > 0 {
		scraperHttpClientConfig.TLSConfig.CAFile = caBundle
	}

	scrapeConfig.HTTPClientConfig = scraperHttpClientConfig
	scrapeConfig.ServiceDiscoveryConfigs = make([]discovery.Config, 1)
	sdHttpClientConfig := common_config.HTTPClientConfig{
		TLSConfig: common_config.TLSConfig{
			// Service discovery is internal only to the director, so there's
			// no need to enforce TLS check
			InsecureSkipVerify: true,
		},
		Authorization: &common_config.Authorization{
			Type:        "Bearer",
			Credentials: common_config.Secret(sdToken),
		},
	}
	scrapeConfig.ServiceDiscoveryConfigs[0] = &prom_http.SDConfig{
		URL:              originDiscoveryUrl.String(),
		RefreshInterval:  model.Duration(15 * time.Second),
		HTTPClientConfig: sdHttpClientConfig,
	}
	return &scrapeConfig, nil
}

// Log method which satisfies the kitlog.Logger interface.
// It also propagates field level and field message to top level log
func (a LogrusAdapter) Log(keyvals ...interface{}) error {
	// Extract the log level and message from the keyvals.
	logLevel := logrus.InfoLevel
	msg := ""
	fields := make(logrus.Fields)
	for k, v := range a.defaultFields {
		fields[k] = v
	}

	for i := 0; i < len(keyvals); i += 2 {
		// NOTE(review): keyvals[i+1] will panic if keyvals has odd length;
		// go-kit callers normally pass key/value pairs, but that is not
		// guaranteed here — confirm whether a length guard is needed.
		if key, ok := keyvals[i].(string); ok {
			if val := keyvals[i+1]; key == "level" {
				// Parse the log level.
				var err error
				logval, ok := val.(level.Value)
				if !ok {
					a.Logger.Error("log: can't log level value")
					// NOTE(review): err is still nil at this point, so this
					// returns a nil error despite the failure logged above.
					return err
				}
				logLevel, err = logrus.ParseLevel(logval.String())
				if err != nil {
					a.Logger.Error("log: invalid log level message")
					return err
				}
			} else if key == "msg" {
				msg, ok = val.(string)
				if !ok {
					a.Logger.Error("log: invalid log message")
					return errors.New("log: invalid log message")
				}
			} else if key == "err" {
				logErr, ok := val.(error)
				if !ok {
					a.Logger.Error("log: invalid error log message")
					return errors.New("log: invalid error log message")
				}
				msg = logErr.Error()
			} else {
				fields[key] = val
			}
		}
	}

	// Set the log level and log the message with the fields.
	entry := a.WithFields(fields)
	switch logLevel {
	case logrus.WarnLevel:
		entry.Warn(msg)
	case logrus.ErrorLevel:
		entry.Error(msg)
	case logrus.InfoLevel:
		entry.Info(msg)
	case logrus.DebugLevel:
		entry.Debug(msg)
	default:
		entry.Info(msg) // Default to info level if not specified.
	}

	return nil
}

// ConfigureEmbeddedPrometheus wires a full in-process Prometheus (TSDB, scrape
// manager, service discovery, PromQL engine, and v1 HTTP API) into the given
// gin engine. When isDirector is true it additionally scrapes origins
// discovered through the director's HTTP SD endpoint. The components run in an
// oklog/run group on a background goroutine; this function returns after
// startup is initiated (errors from the run group are only logged).
func ConfigureEmbeddedPrometheus(engine *gin.Engine, isDirector bool) error {
	cfg := flagConfig{}
	ListenAddress := fmt.Sprintf("0.0.0.0:%v", param.Server_WebPort.GetInt())
	cfg.webTimeout = model.Duration(5 * time.Minute)
	cfg.serverStoragePath = param.Monitoring_DataLocation.GetString()

	// The code below is for testing director Prometheus scraping locally
	// Uncomment only if you know what you are doing

	// if isDirector {
	// 	err := os.MkdirAll("/var/lib/pelican/director-monitoring/data", 0750)
	// 	if err != nil {
	// 		return errors.New("Failure when creating a directory for the monitoring data")
	// 	}
	// 	cfg.serverStoragePath = "/var/lib/pelican/director-monitoring/data"
	// } else {
	// 	cfg.serverStoragePath = param.Monitoring_DataLocation.GetString()
	// }

	// Hard-coded equivalents of the Prometheus CLI flags.
	cfg.tsdb.MinBlockDuration = model.Duration(2 * time.Hour)
	cfg.tsdb.NoLockfile = false
	cfg.tsdb.WALCompression = true
	cfg.tsdb.HeadChunksWriteQueueSize = 0
	cfg.tsdb.SamplesPerChunk = 120
	cfg.RemoteFlushDeadline = model.Duration(1 * time.Minute)
	cfg.outageTolerance = model.Duration(1 * time.Hour)
	cfg.forGracePeriod = model.Duration(10 * time.Minute)
	cfg.resendDelay = model.Duration(1 * time.Minute)
	cfg.lookbackDelta = model.Duration(5 * time.Minute)
	cfg.queryTimeout = model.Duration(2 * time.Minute)
	cfg.queryConcurrency = 20
	cfg.queryMaxSamples = 50000000
	cfg.scrape.DiscoveryReloadInterval = model.Duration(5 * time.Second)

	RemoteReadSampleLimit := int(5e7)
	RemoteReadConcurrencyLimit := 10
	RemoteReadBytesInFrame := 1048576

	scrape.AlignScrapeTimestamps = true
	scrape.ScrapeTimestampTolerance = 2 * time.Millisecond

	logrusLogger := logrus.WithFields(logrus.Fields{"component": "prometheus"})

	// Create a Go kit logger that wraps the logrus logger.
	logger := LogrusAdapter{Logger: logrusLogger.Logger, defaultFields: logrusLogger.Data}

	localStoragePath := cfg.serverStoragePath

	external_url, err := url.Parse(param.Server_ExternalWebUrl.GetString())
	if err != nil {
		return fmt.Errorf("parse external URL %v: %w", param.Server_ExternalWebUrl.GetString(), err)
	}

	CORSOrigin, err := compileCORSRegexString(".*")
	if err != nil {
		panic(err)
	}

	// Throw error for invalid config before starting other components.
	promCfg := config.Config{
		GlobalConfig:  config.DefaultGlobalConfig,
		ScrapeConfigs: make([]*config.ScrapeConfig, 1),
	}

	selfScraperToken, err := createPromMetricToken()
	if err != nil {
		return fmt.Errorf("Failed to generate token for self-scraper at start: %v", err)
	}

	// Self-scrape job: this server scrapes its own /metrics over HTTPS.
	scrapeConfig := config.DefaultScrapeConfig
	scrapeConfig.JobName = "prometheus"
	scrapeConfig.Scheme = "https"
	scraperHttpClientConfig := common_config.HTTPClientConfig{
		TLSConfig: common_config.TLSConfig{
			// This is the self-scrape, so no need to enforce the TLS check
			InsecureSkipVerify: true,
		},
		// We add token auth for scraping all origin/cache servers
		Authorization: &common_config.Authorization{
			Type:        "Bearer",
			Credentials: common_config.Secret(selfScraperToken),
		},
	}
	scrapeConfig.HTTPClientConfig = scraperHttpClientConfig
	scrapeConfig.ServiceDiscoveryConfigs = make([]discovery.Config, 1)
	// model.AddressLabel needs a hostname (w/ port), so we cut the protocol here
	externalAddressWoProtocol, _ := strings.CutPrefix(param.Server_ExternalWebUrl.GetString(), "https://")
	scrapeConfig.ServiceDiscoveryConfigs[0] = discovery.StaticConfig{
		&targetgroup.Group{
			Targets: []model.LabelSet{{
				model.AddressLabel: model.LabelValue(externalAddressWoProtocol),
			}},
		},
	}
	promCfg.ScrapeConfigs[0] = &scrapeConfig

	// Add origins monitoring to director's prometheus instance
	if isDirector {
		dirPromScraperConfig, err := configDirectorPromScraper()
		if err != nil {
			return err
		}
		promCfg.ScrapeConfigs = append(promCfg.ScrapeConfigs, dirPromScraperConfig)
	}

	promCfg.GlobalConfig.ScrapeInterval = model.Duration(15 * time.Second)

	if promCfg.StorageConfig.TSDBConfig != nil {
		cfg.tsdb.OutOfOrderTimeWindow = promCfg.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
	}

	cfg.tsdb.RetentionDuration = defaultRetentionDuration

	// Max block size settings.
	if cfg.tsdb.MaxBlockDuration == 0 {
		maxBlockDuration, err := model.ParseDuration("31d")
		if err != nil {
			panic(err)
		}
		// When the time retention is set and not too big use to define the max block duration.
		if cfg.tsdb.RetentionDuration != 0 && cfg.tsdb.RetentionDuration/10 < maxBlockDuration {
			maxBlockDuration = cfg.tsdb.RetentionDuration / 10
		}

		cfg.tsdb.MaxBlockDuration = maxBlockDuration
	}

	noStepSubqueryInterval := &safePromQLNoStepSubqueryInterval{}
	noStepSubqueryInterval.Set(config.DefaultGlobalConfig.EvaluationInterval)

	var (
		localStorage = &readyStorage{stats: tsdb.NewDBStats()}
		scraper      = &readyScrapeManager{}
		//remoteStorage = remote.NewStorage(log.With(logger, "component", "remote"), prometheus.DefaultRegisterer, localStorage.StartTime, localStoragePath, time.Duration(cfg.RemoteFlushDeadline), scraper)
		//fanoutStorage = storage.NewFanout(logger, localStorage, remoteStorage)
		fanoutStorage = storage.NewFanout(logger, localStorage)
	)

	var (
		//ctxWeb, cancelWeb = context.WithCancel(context.Background())
		//ctxRule = context.Background()

		ctxScrape, cancelScrape = context.WithCancel(context.Background())
		discoveryManagerScrape  discoveryManager
	)

	discovery.RegisterMetrics()
	discoveryManagerScrape = discovery.NewManager(ctxScrape, log.With(logger, "component", "discovery manager scrape"), discovery.Name("scrape"))

	var (
		scrapeManager = scrape.NewManager(&cfg.scrape, log.With(logger, "component", "scrape manager"), fanoutStorage)

		queryEngine *promql.Engine
	)

	{
		opts := promql.EngineOpts{
			Logger:                   log.With(logger, "component", "query engine"),
			Reg:                      prometheus.DefaultRegisterer,
			MaxSamples:               cfg.queryMaxSamples,
			Timeout:                  time.Duration(cfg.queryTimeout),
			ActiveQueryTracker:       promql.NewActiveQueryTracker(localStoragePath, cfg.queryConcurrency, log.With(logger, "component", "activeQueryTracker")),
			LookbackDelta:            time.Duration(cfg.lookbackDelta),
			NoStepSubqueryIntervalFn: noStepSubqueryInterval.Get,
			// EnableAtModifier and EnableNegativeOffset have to be
			// always on for regular PromQL as of Prometheus v2.33.
			EnableAtModifier:     true,
			EnableNegativeOffset: true,
			EnablePerStepStats:   cfg.enablePerStepStats,
		}

		queryEngine = promql.NewEngine(opts)

	}
	scraper.Set(scrapeManager)

	TSDBDir := localStoragePath

	Version := &web.PrometheusVersion{
		Version:   version.Version,
		Revision:  version.Revision,
		Branch:    version.Branch,
		BuildUser: version.BuildUser,
		BuildDate: version.BuildDate,
		GoVersion: version.GoVersion,
	}

	Flags := map[string]string{}

	// Depends on cfg.web.ScrapeManager so needs to be after cfg.web.ScrapeManager = scrapeManager.
	// webHandler := web.New(log.With(logger, "component", "web"), &cfg.web)

	// Monitor outgoing connections on default transport with conntrack.
	http.DefaultTransport.(*http.Transport).DialContext = conntrack.NewDialContextFunc(
		conntrack.DialWithTracing(),
	)

	factorySPr := func(_ context.Context) api_v1.ScrapePoolsRetriever { return scrapeManager }
	factoryTr := func(_ context.Context) api_v1.TargetRetriever { return scrapeManager }
	factoryAr := func(_ context.Context) api_v1.AlertmanagerRetriever { return nil }
	FactoryRr := func(_ context.Context) api_v1.RulesRetriever { return nil }

	readyHandler := ReadyHandler{}
	readyHandler.SetReady(false)

	// app is a nil storage.Appendable: the v1 API's remote-write receiver is unused here.
	var app storage.Appendable
	apiV1 := api_v1.NewAPI(
		queryEngine,
		fanoutStorage,
		app,
		localStorage,
		factorySPr,
		factoryTr,
		factoryAr,
		func() config.Config {
			globalConfigMtx.RLock()
			defer globalConfigMtx.RUnlock()
			return globalConfig
		},
		Flags,
		api_v1.GlobalURLOptions{
			ListenAddress: ListenAddress,
			Host:          external_url.Host,
			Scheme:        external_url.Scheme,
		},
		readyHandler.testReady,
		localStorage,
		TSDBDir,
		false,
		logger,
		FactoryRr,
		RemoteReadSampleLimit,
		RemoteReadConcurrencyLimit,
		RemoteReadBytesInFrame,
		false,
		CORSOrigin,
		runtimeInfo,
		Version,
		prometheus.DefaultGatherer,
		prometheus.DefaultRegisterer,
		nil,
	)
	av1 := route.New().WithPrefix("/api/v1.0/prometheus")
	//WithInstrumentation(h.metrics.instrumentHandlerWithPrefix("/api/v1")).
	//WithInstrumentation(setPathWithPrefix("/api/v1"))
	apiV1.Register(av1)

	// TODO: Add authorization to director's PromQL endpoint once there's a
	// way that user can be authenticated or we have a web UI for director
	if !isDirector {
		engine.GET("/api/v1.0/prometheus/*any", promQueryEngineAuthHandler(av1))
	} else {
		engine.GET("/api/v1.0/prometheus/*any", func(ctx *gin.Context) {
			av1.ServeHTTP(ctx.Writer, ctx.Request)
		})
	}

	// Components re-applied on every config (re)load; order matters and
	// mirrors upstream Prometheus.
	reloaders := []reloader{
		{
			name:     "db_storage",
			reloader: localStorage.ApplyConfig,
		}, /* {
			name:     "web_handler",
			reloader: webHandler.ApplyConfig,
		},*/ {
			name: "query_engine",
			reloader: func(cfg *config.Config) error {
				queryEngine.SetQueryLogger(nil)
				return nil
			},
		}, {
			name:     "scrape",
			reloader: scrapeManager.ApplyConfig,
		}, {
			name: "scrape_sd",
			reloader: func(cfg *config.Config) error {
				c := make(map[string]discovery.Configs)
				scfgs, err := cfg.GetScrapeConfigs()
				if err != nil {
					return err
				}
				for _, v := range scfgs {
					c[v.JobName] = v.ServiceDiscoveryConfigs
				}
				return discoveryManagerScrape.ApplyConfig(c)
			},
		},
	}

	// Start all components while we wait for TSDB to open but only load
	// initial config and mark ourselves as ready after it completed.
	dbOpen := make(chan struct{})

	// sync.Once is used to make sure we can close the channel at different execution stages(SIGTERM or when the config is loaded).
	type closeOnce struct {
		C     chan struct{}
		once  sync.Once
		Close func()
	}
	// Wait until the server is ready to handle reloading.
	reloadReady := &closeOnce{
		C: make(chan struct{}),
	}
	reloadReady.Close = func() {
		reloadReady.once.Do(func() {
			close(reloadReady.C)
		})
	}
	var g run.Group
	{
		// Termination handler.
		term := make(chan os.Signal, 1)
		signal.Notify(term, os.Interrupt, syscall.SIGTERM)
		cancel := make(chan struct{})
		g.Add(
			func() error {
				// Don't forget to release the reloadReady channel so that waiting blocks can exit normally.
				select {
				case <-term:
					err := level.Warn(logger).Log("msg", "Received SIGTERM, exiting gracefully...")
					_ = err
					reloadReady.Close()
				//case <-webHandler.Quit():
				//	level.Warn(logger).Log("msg", "Received termination request via web service, exiting gracefully...")
				case <-cancel:
					reloadReady.Close()
				}
				return nil
			},
			func(err error) {
				close(cancel)
				//webHandler.SetReady(false)
				readyHandler.SetReady(false)
			},
		)
	}
	{
		// Scrape discovery manager.
		g.Add(
			func() error {
				err := discoveryManagerScrape.Run()
				err2 := level.Info(logger).Log("msg", "Scrape discovery manager stopped")
				_ = err2
				return err
			},
			func(err error) {
				err2 := level.Info(logger).Log("msg", "Stopping scrape discovery manager...")
				_ = err2
				cancelScrape()
			},
		)
	}
	{
		// Periodic scraper config reload to refresh service discovery token
		cancel := make(chan struct{})
		g.Add(
			func() error {
				refreshInterval := param.Monitoring_TokenRefreshInterval.GetDuration()
				if refreshInterval <= 0 {
					err := level.Warn(logger).Log("msg", "Refresh interval is non-positive value. Stop reloading.")
					_ = err
					return errors.New("Refresh interval is non-positive value. Stop reloading.")
				}
				ticker := time.NewTicker(refreshInterval)
				for {
					select {
					case <-cancel:
						ticker.Stop()
						err1 := level.Info(logger).Log("msg", "Stopping scraper config periodic reload...")
						_ = err1
						return nil
					case <-ticker.C:
						// Create an anonymous function to always use defer for locks
						err := func() error {
							globalConfigMtx.Lock()
							defer globalConfigMtx.Unlock()
							// Create a new self-scrape token
							selfScraperToken, err := createPromMetricToken()
							if err != nil {
								return fmt.Errorf("Failed to generate token for self-scraper at start: %v", err)
							}

							// We need a fresh ScrapeConfigs copy so that deepEqual can give us green light
							// before reload the scrape config
							tempConfig := config.Config{
								GlobalConfig:  promCfg.GlobalConfig,
								ScrapeConfigs: make([]*config.ScrapeConfig, 1),
							}

							if len(promCfg.ScrapeConfigs) < 1 {
								return errors.New("Length of ScrapeConfigs is less than 1, abort reloading")
							}

							oldScrapeCfg := promCfg.ScrapeConfigs[0]

							// Rebuild the self-scrape config with the fresh token,
							// reusing the old job name, scheme, and SD config.
							newScrapeConfig := config.DefaultScrapeConfig
							newScrapeConfig.JobName = oldScrapeCfg.JobName
							newScrapeConfig.Scheme = oldScrapeCfg.Scheme
							scraperHttpClientConfig := common_config.HTTPClientConfig{
								TLSConfig: common_config.TLSConfig{
									// This is the self-scrape, so no need to enforce TLS check
									InsecureSkipVerify: true,
								},
								Authorization: &common_config.Authorization{
									Type:        "Bearer",
									Credentials: common_config.Secret(selfScraperToken),
								},
							}
							newScrapeConfig.HTTPClientConfig = scraperHttpClientConfig
							newScrapeConfig.ServiceDiscoveryConfigs = make([]discovery.Config, 1)
							newScrapeConfig.ServiceDiscoveryConfigs[0] = oldScrapeCfg.ServiceDiscoveryConfigs[0]
							tempConfig.ScrapeConfigs[0] = &newScrapeConfig

							if len(promCfg.ScrapeConfigs) > 1 {
								for idx, cfg := range promCfg.ScrapeConfigs {
									if idx != 0 {
										tempConfig.ScrapeConfigs = append(tempConfig.ScrapeConfigs, cfg)
									}
								}
							}

							// Refresh the scraper token by reloading the scraper config
							err = scrapeManager.ApplyConfig(&tempConfig)

							if err != nil {
								return fmt.Errorf("Failed to reapply scrape configs: %v", err)
							}

							if isDirector {
								// Refresh service discovery token by re-configure scraper
								if len(promCfg.ScrapeConfigs) < 2 {
									return errors.New("Prometheus scraper config didn't include origins HTTP SD config. Length of configs less than 2.")
								}
								// Index 0 is the default config for servers
								// Create new director-scrap token & service discovery token
								promCfg.ScrapeConfigs[1], err = configDirectorPromScraper()
								if err != nil {
									return fmt.Errorf("Failed to generate token for director scraper when refresh it: %v", err)
								}
							}

							c := make(map[string]discovery.Configs)
							scfgs, err := promCfg.GetScrapeConfigs()
							if err != nil {
								return err
							}
							for _, v := range scfgs {
								c[v.JobName] = v.ServiceDiscoveryConfigs
							}
							// We refresh the service discovery config for all the scrapers
							if err := discoveryManagerScrape.ApplyConfig(c); err != nil {
								err2 := level.Error(logger).Log("msg", fmt.Sprint("Scraper service discovery config periodic reload failed: ", err))
								_ = err2
								return err
							}

							err = level.Info(logger).Log("msg", "Successfully reloaded scraper and service discovery config")
							_ = err
							return nil
						}()
						if err != nil {
							return err
						}
					}
				}
			},
			func(err error) {
				err2 := level.Info(logger).Log("msg", "Stopping scraper config periodic reload...")
				_ = err2
				// terminate reload
				close(cancel)
			},
		)
	}
	{
		// Scrape manager.
		g.Add(
			func() error {
				// When the scrape manager receives a new targets list
				// it needs to read a valid config for each job.
				// It depends on the config being in sync with the discovery manager so
				// we wait until the config is fully loaded.
				<-reloadReady.C

				err := scrapeManager.Run(discoveryManagerScrape.SyncCh())
				err2 := level.Info(logger).Log("msg", "Scrape manager stopped")
				_ = err2
				return err
			},
			func(err error) {
				// Scrape manager needs to be stopped before closing the local TSDB
				// so that it doesn't try to write samples to a closed storage.
				// We should also wait for rule manager to be fully stopped to ensure
				// we don't trigger any false positive alerts for rules using absent().
				err2 := level.Info(logger).Log("msg", "Stopping scrape manager...")
				_ = err2
				scrapeManager.Stop()
			},
		)
	}
	{
		// Initial configuration loading: waits for the TSDB to open, applies the
		// config via every reloader, then marks the server ready.
		cancel := make(chan struct{})
		g.Add(
			func() error {
				select {
				case <-dbOpen:
				case <-cancel:
					reloadReady.Close()
					return nil
				}

				if err := reloadConfig(&promCfg, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil {
					return fmt.Errorf("error loading config: %w", err)
				}
				reloadReady.Close()

				readyHandler.SetReady(true)
				err2 := level.Info(logger).Log("msg", "Server is ready to receive web requests.")
				_ = err2
				<-cancel
				return nil
			},
			func(err error) {
				close(cancel)
			},
		)

	}
	{
		// TSDB.
		opts := cfg.tsdb.ToTSDBOptions()
		cancel := make(chan struct{})
		g.Add(
			func() error {
				err = level.Info(logger).Log("msg", "Starting TSDB ...")
				_ = err
				if cfg.tsdb.WALSegmentSize != 0 {
					if cfg.tsdb.WALSegmentSize < 10*1024*1024 || cfg.tsdb.WALSegmentSize > 256*1024*1024 {
						return errors.New("flag 'storage.tsdb.wal-segment-size' must be set between 10MB and 256MB")
					}
				}
				if cfg.tsdb.MaxBlockChunkSegmentSize != 0 {
					if cfg.tsdb.MaxBlockChunkSegmentSize < 1024*1024 {
						return errors.New("flag 'storage.tsdb.max-block-chunk-segment-size' must be set over 1MB")
					}
				}

				db, err := openDBWithMetrics(localStoragePath, logger, prometheus.DefaultRegisterer, &opts, localStorage.getStats())
				if err != nil {
					return fmt.Errorf("opening storage failed: %w", err)
				}

				err = level.Info(logger).Log("msg", "TSDB started")
				_ = err
				err = level.Debug(logger).Log("msg", "TSDB options",
					"MinBlockDuration", cfg.tsdb.MinBlockDuration,
					"MaxBlockDuration", cfg.tsdb.MaxBlockDuration,
					"MaxBytes", cfg.tsdb.MaxBytes,
					"NoLockfile", cfg.tsdb.NoLockfile,
					"RetentionDuration", cfg.tsdb.RetentionDuration,
					"WALSegmentSize", cfg.tsdb.WALSegmentSize,
					"WALCompression", cfg.tsdb.WALCompression,
				)
				_ = err

				startTimeMargin := int64(2 * time.Duration(cfg.tsdb.MinBlockDuration).Seconds() * 1000)
				localStorage.Set(db, startTimeMargin)
				//db.SetWriteNotified(remoteStorage)
				close(dbOpen)
				<-cancel
				return nil
			},
			func(err error) {
				if err := fanoutStorage.Close(); err != nil {
					err = level.Error(logger).Log("msg", "Error stopping storage", "err", err)
					_ = err
				}
				close(cancel)
			},
		)
	}
	// Run the component group in the background; errors are logged, not returned.
	go func() {
		if err := g.Run(); err != nil {
			err = level.Error(logger).Log("err", err)
			_ = err
		}
	}()

	return nil
}

// openDBWithMetrics opens the TSDB at dir and registers gauges exposing its
// lowest stored timestamp and the head block's min/max time (in seconds).
func openDBWithMetrics(dir string, logger log.Logger, reg prometheus.Registerer, opts *tsdb.Options, stats *tsdb.DBStats) (*tsdb.DB, error) {
	db, err := tsdb.Open(
		dir,
		log.With(logger, "component", "tsdb"),
		reg,
		opts,
		stats,
	)
	if err != nil {
		return nil, err
	}

	reg.MustRegister(
		prometheus.NewGaugeFunc(prometheus.GaugeOpts{
			Name: "prometheus_tsdb_lowest_timestamp_seconds",
			Help: "Lowest timestamp value stored in the database.",
		}, func() float64 {
			bb := db.Blocks()
			if len(bb) == 0 {
				return float64(db.Head().MinTime() / 1000)
			}
			return float64(db.Blocks()[0].Meta().MinTime / 1000)
		}), prometheus.NewGaugeFunc(prometheus.GaugeOpts{
			Name: "prometheus_tsdb_head_min_time_seconds",
			Help: "Minimum time bound of the head block.",
		}, func() float64 { return float64(db.Head().MinTime() / 1000) }),
		prometheus.NewGaugeFunc(prometheus.GaugeOpts{
			Name: "prometheus_tsdb_head_max_time_seconds",
			Help: "Maximum timestamp of the head block.",
		}, func() float64 { return float64(db.Head().MaxTime() / 1000) }),
	)

	return db, nil
}

// safePromQLNoStepSubqueryInterval stores the default subquery interval
// (milliseconds) behind an atomic so PromQL can read it concurrently.
type safePromQLNoStepSubqueryInterval struct {
	value atomic.Int64
}

func durationToInt64Millis(d time.Duration) int64 {
	return int64(d / time.Millisecond)
}

// Set stores the interval, converting from model.Duration to milliseconds.
func (i *safePromQLNoStepSubqueryInterval) Set(ev model.Duration) {
	i.value.Store(durationToInt64Millis(time.Duration(ev)))
}

// Get returns the stored interval; the int64 argument (range in ms) is ignored.
func (i *safePromQLNoStepSubqueryInterval) Get(int64) int64 {
	return i.value.Load()
}

// reloader pairs a component name with its config-apply callback, for logging
// per-component reload timings.
type reloader struct {
	name     string
	reloader func(*config.Config) error
}

// reloadConfig publishes conf to globalConfig and applies it to every reloader
// in order. Returns an error if any reloader fails (all are still attempted).
func reloadConfig(conf *config.Config, expandExternalLabels, enableExemplarStorage bool, logger log.Logger, noStepSuqueryInterval *safePromQLNoStepSubqueryInterval, rls ...reloader) (err error) {
	start := time.Now()
	timings := []interface{}{}

	{
		globalConfigMtx.Lock()
		// NOTE(review): defer runs at function return, not at the end of this
		// block, so the mutex is held while every reloader below runs —
		// possibly intentional (serializes reloads), but worth confirming.
		defer globalConfigMtx.Unlock()
		globalConfig = *conf
	}

	failed := false
	for _, rl := range rls {
		rstart := time.Now()
		if err := rl.reloader(conf); err != nil {
			err = level.Error(logger).Log("msg", "Failed to apply configuration", "err", err)
			_ = err
			failed = true
		}
		timings = append(timings, rl.name, time.Since(rstart))
	}
	if failed {
		return fmt.Errorf("one or more errors occurred while applying the new configuration")
	}

	noStepSuqueryInterval.Set(conf.GlobalConfig.EvaluationInterval)
	l := []interface{}{"msg", "Completed loading of configuration", "totalDuration", time.Since(start)}
	err = level.Info(logger).Log(append(l, timings...)...)
	_ = err
	return nil
}

// compileCORSRegexString compiles given string and adds anchors
func compileCORSRegexString(s string) (*regexp.Regexp, error) {
	r, err := relabel.NewRegexp(s)
	if err != nil {
		return nil, err
	}
	return r.Regexp, nil
}

// readyStorage implements the Storage interface while allowing to set the actual
// storage at a later point in time.
type readyStorage struct {
	mtx             sync.RWMutex
	db              storage.Storage
	startTimeMargin int64
	stats           *tsdb.DBStats
}

// ApplyConfig forwards the config to the underlying *tsdb.DB, if set.
func (s *readyStorage) ApplyConfig(conf *config.Config) error {
	db := s.get()
	if db, ok := db.(*tsdb.DB); ok {
		return db.ApplyConfig(conf)
	}
	return nil
}

// Set the storage.
func (s *readyStorage) Set(db storage.Storage, startTimeMargin int64) {
	s.mtx.Lock()
	defer s.mtx.Unlock()

	s.db = db
	s.startTimeMargin = startTimeMargin
}

// get returns the underlying storage, or nil if not yet set.
func (s *readyStorage) get() storage.Storage {
	s.mtx.RLock()
	x := s.db
	s.mtx.RUnlock()
	return x
}

func (s *readyStorage) getStats() *tsdb.DBStats {
	s.mtx.RLock()
	x := s.stats
	s.mtx.RUnlock()
	return x
}

// StartTime implements the Storage interface.
1001 func (s *readyStorage) StartTime() (int64, error) { 1002 if x := s.get(); x != nil { 1003 switch db := x.(type) { 1004 case *tsdb.DB: 1005 var startTime int64 1006 if len(db.Blocks()) > 0 { 1007 startTime = db.Blocks()[0].Meta().MinTime 1008 } else { 1009 startTime = time.Now().Unix() * 1000 1010 } 1011 // Add a safety margin as it may take a few minutes for everything to spin up. 1012 return startTime + s.startTimeMargin, nil 1013 case *agent.DB: 1014 return db.StartTime() 1015 default: 1016 panic(fmt.Sprintf("unknown storage type %T", db)) 1017 } 1018 } 1019 1020 return math.MaxInt64, tsdb.ErrNotReady 1021 } 1022 1023 // Querier implements the Storage interface. 1024 func (s *readyStorage) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) { 1025 if x := s.get(); x != nil { 1026 return x.Querier(ctx, mint, maxt) 1027 } 1028 return nil, tsdb.ErrNotReady 1029 } 1030 1031 // ChunkQuerier implements the Storage interface. 1032 func (s *readyStorage) ChunkQuerier(ctx context.Context, mint, maxt int64) (storage.ChunkQuerier, error) { 1033 if x := s.get(); x != nil { 1034 return x.ChunkQuerier(ctx, mint, maxt) 1035 } 1036 return nil, tsdb.ErrNotReady 1037 } 1038 1039 func (s *readyStorage) ExemplarQuerier(ctx context.Context) (storage.ExemplarQuerier, error) { 1040 if x := s.get(); x != nil { 1041 switch db := x.(type) { 1042 case *tsdb.DB: 1043 return db.ExemplarQuerier(ctx) 1044 case *agent.DB: 1045 return nil, agent.ErrUnsupported 1046 default: 1047 panic(fmt.Sprintf("unknown storage type %T", db)) 1048 } 1049 } 1050 return nil, tsdb.ErrNotReady 1051 } 1052 1053 // Appender implements the Storage interface. 
1054 func (s *readyStorage) Appender(ctx context.Context) storage.Appender { 1055 if x := s.get(); x != nil { 1056 return x.Appender(ctx) 1057 } 1058 return notReadyAppender{} 1059 } 1060 1061 type notReadyAppender struct{} 1062 1063 func (n notReadyAppender) Append(ref storage.SeriesRef, l labels.Labels, t int64, v float64) (storage.SeriesRef, error) { 1064 return 0, tsdb.ErrNotReady 1065 } 1066 1067 func (n notReadyAppender) AppendExemplar(ref storage.SeriesRef, l labels.Labels, e exemplar.Exemplar) (storage.SeriesRef, error) { 1068 return 0, tsdb.ErrNotReady 1069 } 1070 1071 func (n notReadyAppender) AppendHistogram(ref storage.SeriesRef, l labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (storage.SeriesRef, error) { 1072 return 0, tsdb.ErrNotReady 1073 } 1074 1075 func (n notReadyAppender) UpdateMetadata(ref storage.SeriesRef, l labels.Labels, m metadata.Metadata) (storage.SeriesRef, error) { 1076 return 0, tsdb.ErrNotReady 1077 } 1078 1079 func (n notReadyAppender) Commit() error { return tsdb.ErrNotReady } 1080 1081 func (n notReadyAppender) Rollback() error { return tsdb.ErrNotReady } 1082 1083 // Close implements the Storage interface. 1084 func (s *readyStorage) Close() error { 1085 if x := s.get(); x != nil { 1086 return x.Close() 1087 } 1088 return nil 1089 } 1090 1091 // CleanTombstones implements the api_v1.TSDBAdminStats and api_v2.TSDBAdmin interfaces. 1092 func (s *readyStorage) CleanTombstones() error { 1093 if x := s.get(); x != nil { 1094 switch db := x.(type) { 1095 case *tsdb.DB: 1096 return db.CleanTombstones() 1097 case *agent.DB: 1098 return agent.ErrUnsupported 1099 default: 1100 panic(fmt.Sprintf("unknown storage type %T", db)) 1101 } 1102 } 1103 return tsdb.ErrNotReady 1104 } 1105 1106 // Delete implements the api_v1.TSDBAdminStats and api_v2.TSDBAdmin interfaces. 
1107 func (s *readyStorage) Delete(mint, maxt int64, ms ...*labels.Matcher) error { 1108 if x := s.get(); x != nil { 1109 switch db := x.(type) { 1110 case *tsdb.DB: 1111 return db.Delete(mint, maxt, ms...) 1112 case *agent.DB: 1113 return agent.ErrUnsupported 1114 default: 1115 panic(fmt.Sprintf("unknown storage type %T", db)) 1116 } 1117 } 1118 return tsdb.ErrNotReady 1119 } 1120 1121 // Snapshot implements the api_v1.TSDBAdminStats and api_v2.TSDBAdmin interfaces. 1122 func (s *readyStorage) Snapshot(dir string, withHead bool) error { 1123 if x := s.get(); x != nil { 1124 switch db := x.(type) { 1125 case *tsdb.DB: 1126 return db.Snapshot(dir, withHead) 1127 case *agent.DB: 1128 return agent.ErrUnsupported 1129 default: 1130 panic(fmt.Sprintf("unknown storage type %T", db)) 1131 } 1132 } 1133 return tsdb.ErrNotReady 1134 } 1135 1136 // Stats implements the api_v1.TSDBAdminStats interface. 1137 func (s *readyStorage) Stats(statsByLabelName string, limit int) (*tsdb.Stats, error) { 1138 if x := s.get(); x != nil { 1139 switch db := x.(type) { 1140 case *tsdb.DB: 1141 return db.Head().Stats(statsByLabelName, limit), nil 1142 case *agent.DB: 1143 return nil, agent.ErrUnsupported 1144 default: 1145 panic(fmt.Sprintf("unknown storage type %T", db)) 1146 } 1147 } 1148 return nil, tsdb.ErrNotReady 1149 } 1150 1151 // WALReplayStatus implements the api_v1.TSDBStats interface. 1152 func (s *readyStorage) WALReplayStatus() (tsdb.WALReplayStatus, error) { 1153 if x := s.getStats(); x != nil { 1154 return x.Head.WALReplayStatus.GetWALReplayStatus(), nil 1155 } 1156 return tsdb.WALReplayStatus{}, tsdb.ErrNotReady 1157 } 1158 1159 // ErrNotReady is returned if the underlying scrape manager is not ready yet. 1160 var ErrNotReady = errors.New("Scrape manager not ready") 1161 1162 // ReadyScrapeManager allows a scrape manager to be retrieved. Even if it's set at a later point in time. 
1163 type readyScrapeManager struct { 1164 mtx sync.RWMutex 1165 m *scrape.Manager 1166 } 1167 1168 // Set the scrape manager. 1169 func (rm *readyScrapeManager) Set(m *scrape.Manager) { 1170 rm.mtx.Lock() 1171 defer rm.mtx.Unlock() 1172 1173 rm.m = m 1174 } 1175 1176 // Get the scrape manager. If is not ready, return an error. 1177 func (rm *readyScrapeManager) Get() (*scrape.Manager, error) { 1178 rm.mtx.RLock() 1179 defer rm.mtx.RUnlock() 1180 1181 if rm.m != nil { 1182 return rm.m, nil 1183 } 1184 1185 return nil, ErrNotReady 1186 } 1187 1188 // tsdbOptions is tsdb.Option version with defined units. 1189 // This is required as tsdb.Option fields are unit agnostic (time). 1190 type tsdbOptions struct { 1191 WALSegmentSize units.Base2Bytes 1192 MaxBlockChunkSegmentSize units.Base2Bytes 1193 RetentionDuration model.Duration 1194 MaxBytes units.Base2Bytes 1195 NoLockfile bool 1196 WALCompression bool 1197 WALCompressionType string 1198 HeadChunksWriteQueueSize int 1199 SamplesPerChunk int 1200 StripeSize int 1201 MinBlockDuration model.Duration 1202 MaxBlockDuration model.Duration 1203 OutOfOrderTimeWindow int64 1204 EnableExemplarStorage bool 1205 MaxExemplars int64 1206 EnableMemorySnapshotOnShutdown bool 1207 EnableNativeHistograms bool 1208 } 1209 1210 func (opts tsdbOptions) ToTSDBOptions() tsdb.Options { 1211 return tsdb.Options{ 1212 WALSegmentSize: int(opts.WALSegmentSize), 1213 MaxBlockChunkSegmentSize: int64(opts.MaxBlockChunkSegmentSize), 1214 RetentionDuration: int64(time.Duration(opts.RetentionDuration) / time.Millisecond), 1215 MaxBytes: int64(opts.MaxBytes), 1216 NoLockfile: opts.NoLockfile, 1217 AllowOverlappingCompaction: true, 1218 WALCompression: wlog.ParseCompressionType(opts.WALCompression, opts.WALCompressionType), 1219 HeadChunksWriteQueueSize: opts.HeadChunksWriteQueueSize, 1220 SamplesPerChunk: opts.SamplesPerChunk, 1221 StripeSize: opts.StripeSize, 1222 MinBlockDuration: int64(time.Duration(opts.MinBlockDuration) / time.Millisecond), 1223 
MaxBlockDuration: int64(time.Duration(opts.MaxBlockDuration) / time.Millisecond), 1224 EnableExemplarStorage: opts.EnableExemplarStorage, 1225 MaxExemplars: opts.MaxExemplars, 1226 EnableMemorySnapshotOnShutdown: opts.EnableMemorySnapshotOnShutdown, 1227 EnableNativeHistograms: opts.EnableNativeHistograms, 1228 OutOfOrderTimeWindow: opts.OutOfOrderTimeWindow, 1229 } 1230 } 1231 1232 // discoveryManager interfaces the discovery manager. This is used to keep using 1233 // the manager that restarts SD's on reload for a few releases until we feel 1234 // the new manager can be enabled for all users. 1235 type discoveryManager interface { 1236 ApplyConfig(cfg map[string]discovery.Configs) error 1237 Run() error 1238 SyncCh() <-chan map[string][]*targetgroup.Group 1239 }