github.com/thanos-io/thanos@v0.32.5/cmd/thanos/receive.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package main

import (
	"context"
	"os"
	"path"
	"strings"
	"time"

	extflag "github.com/efficientgo/tools/extkingpin"
	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	grpc_logging "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/logging"
	"github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/tags"
	"github.com/oklog/run"
	"github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/model/relabel"
	"github.com/prometheus/prometheus/tsdb"
	"github.com/prometheus/prometheus/tsdb/wlog"
	"google.golang.org/grpc"
	"gopkg.in/yaml.v2"

	"github.com/thanos-io/objstore"
	"github.com/thanos-io/objstore/client"
	objstoretracing "github.com/thanos-io/objstore/tracing/opentracing"

	"github.com/thanos-io/thanos/pkg/block/metadata"
	"github.com/thanos-io/thanos/pkg/component"
	"github.com/thanos-io/thanos/pkg/exemplars"
	"github.com/thanos-io/thanos/pkg/extgrpc"
	"github.com/thanos-io/thanos/pkg/extgrpc/snappy"
	"github.com/thanos-io/thanos/pkg/extkingpin"
	"github.com/thanos-io/thanos/pkg/extprom"
	"github.com/thanos-io/thanos/pkg/info"
	"github.com/thanos-io/thanos/pkg/info/infopb"
	"github.com/thanos-io/thanos/pkg/logging"
	"github.com/thanos-io/thanos/pkg/prober"
	"github.com/thanos-io/thanos/pkg/receive"
	"github.com/thanos-io/thanos/pkg/runutil"
	grpcserver "github.com/thanos-io/thanos/pkg/server/grpc"
	httpserver "github.com/thanos-io/thanos/pkg/server/http"
	"github.com/thanos-io/thanos/pkg/store"
	"github.com/thanos-io/thanos/pkg/store/labelpb"
	"github.com/thanos-io/thanos/pkg/tenancy"
	"github.com/thanos-io/thanos/pkg/tls"
)

const compressionNone = "none"

func registerReceive(app *extkingpin.App) {
	cmd := app.Command(component.Receive.String(), "Accept Prometheus remote write API requests and write to local tsdb.")

	conf := &receiveConfig{}
	conf.registerFlag(cmd)

	cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, debugLogging bool) error {
		lset, err := parseFlagLabels(conf.labelStrs)
		if err != nil {
			return errors.Wrap(err, "parse labels")
		}

		if !model.LabelName.IsValid(model.LabelName(conf.tenantLabelName)) {
			return errors.Errorf("unsupported format for tenant label name, got %s", conf.tenantLabelName)
		}
		if len(lset) == 0 {
			return errors.New("no external labels configured for receive, uniquely identifying external labels must be configured (ideally with `receive_` prefix); see https://thanos.io/tip/thanos/storage.md#external-labels for details.")
		}

		tagOpts, grpcLogOpts, err := logging.ParsegRPCOptions("", conf.reqLogConfig)
		if err != nil {
			return errors.Wrap(err, "error while parsing config for request logging")
		}

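		// Note: the Prometheus TSDB options below are expressed in milliseconds,
		// while the flags are model.Durations, hence the conversions. For example,
		// the default 2h min/max block duration becomes 7200000 ms and the default
		// 15d retention becomes 1296000000 ms.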
		tsdbOpts := &tsdb.Options{
			MinBlockDuration:               int64(time.Duration(*conf.tsdbMinBlockDuration) / time.Millisecond),
			MaxBlockDuration:               int64(time.Duration(*conf.tsdbMaxBlockDuration) / time.Millisecond),
			RetentionDuration:              int64(time.Duration(*conf.retention) / time.Millisecond),
			OutOfOrderTimeWindow:           int64(time.Duration(*conf.tsdbOutOfOrderTimeWindow) / time.Millisecond),
			OutOfOrderCapMax:               conf.tsdbOutOfOrderCapMax,
			NoLockfile:                     conf.noLockFile,
			WALCompression:                 wlog.ParseCompressionType(conf.walCompression, string(wlog.CompressionSnappy)),
			MaxExemplars:                   conf.tsdbMaxExemplars,
			EnableExemplarStorage:          conf.tsdbMaxExemplars > 0,
			HeadChunksWriteQueueSize:       int(conf.tsdbWriteQueueSize),
			EnableMemorySnapshotOnShutdown: conf.tsdbMemorySnapshotOnShutdown,
			EnableNativeHistograms:         conf.tsdbEnableNativeHistograms,
		}

		// Are we running in IngestorOnly, RouterOnly or RouterIngestor mode?
		receiveMode := conf.determineMode()

		return runReceive(
			g,
			logger,
			debugLogging,
			reg,
			tracer,
			grpcLogOpts, tagOpts,
			tsdbOpts,
			lset,
			component.Receive,
			metadata.HashFunc(conf.hashFunc),
			receiveMode,
			conf,
		)
	})
}

func runReceive(
	g *run.Group,
	logger log.Logger,
	debugLogging bool,
	reg *prometheus.Registry,
	tracer opentracing.Tracer,
	grpcLogOpts []grpc_logging.Option,
	tagOpts []tags.Option,
	tsdbOpts *tsdb.Options,
	lset labels.Labels,
	comp component.SourceStoreAPI,
	hashFunc metadata.HashFunc,
	receiveMode receive.ReceiverMode,
	conf *receiveConfig,
) error {
	logger = log.With(logger, "component", "receive")

	level.Info(logger).Log("mode", receiveMode, "msg", "running receive")

	rwTLSConfig, err := tls.NewServerConfig(log.With(logger, "protocol", "HTTP"), conf.rwServerCert, conf.rwServerKey, conf.rwServerClientCA)
	if err != nil {
		return err
	}

	dialOpts, err := extgrpc.StoreClientGRPCOpts(
		logger,
		reg,
		tracer,
		conf.grpcConfig.tlsSrvCert != "",
		conf.grpcConfig.tlsSrvClientCA == "",
		conf.rwClientCert,
		conf.rwClientKey,
		conf.rwClientServerCA,
		conf.rwClientServerName,
	)
	if err != nil {
		return err
	}
	if conf.compression != compressionNone {
		dialOpts = append(dialOpts, grpc.WithDefaultCallOptions(grpc.UseCompressor(conf.compression)))
	}

	var bkt objstore.Bucket
	confContentYaml, err := conf.objStoreConfig.Content()
	if err != nil {
		return err
	}

	// Has this thanos receive instance been configured to ingest metrics into a local TSDB?
	enableIngestion := receiveMode == receive.IngestorOnly || receiveMode == receive.RouterIngestor

	upload := len(confContentYaml) > 0
	if enableIngestion {
		if upload {
			if tsdbOpts.MinBlockDuration != tsdbOpts.MaxBlockDuration {
				if !conf.ignoreBlockSize {
					return errors.Errorf("found that TSDB max-block-duration is %d and min-block-duration is %d. "+
						"Compaction needs to be disabled (tsdb.min-block-duration = tsdb.max-block-duration)", tsdbOpts.MaxBlockDuration, tsdbOpts.MinBlockDuration)
				}
				level.Warn(logger).Log("msg", "flag to ignore differing min/max block durations is being used. If the upload of a 2h block fails and a TSDB compaction happens, that block may be missing from your Thanos bucket storage.")
			}
			// The background shipper continuously scans the data directory and uploads
			// new blocks to the object storage service.
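			// An illustrative objstore configuration (shape only; see
			// https://thanos.io/tip/thanos/storage.md for the full per-provider schema):
			//
			//	type: S3
			//	config:
			//	  bucket: thanos-blocks
			//	  endpoint: s3.us-east-1.amazonaws.com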
			bkt, err = client.NewBucket(logger, confContentYaml, comp.String())
			if err != nil {
				return err
			}
			bkt = objstoretracing.WrapWithTraces(objstore.WrapWithMetrics(bkt, extprom.WrapRegistererWithPrefix("thanos_", reg), bkt.Name()))
		} else {
			level.Info(logger).Log("msg", "no supported bucket was configured, uploads will be disabled")
		}
	}

	// TODO(brancz): remove after a couple of versions
	// Migrate non-multi-tsdb capable storage to multi-tsdb disk layout.
	if err := migrateLegacyStorage(logger, conf.dataDir, conf.defaultTenantID); err != nil {
		return errors.Wrapf(err, "migrate legacy storage in %v to default tenant %v", conf.dataDir, conf.defaultTenantID)
	}

	relabelContentYaml, err := conf.relabelConfigPath.Content()
	if err != nil {
		return errors.Wrap(err, "get content of relabel configuration")
	}
	var relabelConfig []*relabel.Config
	if err := yaml.Unmarshal(relabelContentYaml, &relabelConfig); err != nil {
		return errors.Wrap(err, "parse relabel configuration")
	}

	dbs := receive.NewMultiTSDB(
		conf.dataDir,
		logger,
		reg,
		tsdbOpts,
		lset,
		conf.tenantLabelName,
		bkt,
		conf.allowOutOfOrderUpload,
		hashFunc,
	)
	writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs, &receive.WriterOptions{
		Intern:                   conf.writerInterning,
		TooFarInFutureTimeWindow: int64(time.Duration(*conf.tsdbTooFarInFutureTimeWindow)),
	})

	var limitsConfig *receive.RootLimitsConfig
	if conf.writeLimitsConfig != nil {
		limitsContentYaml, err := conf.writeLimitsConfig.Content()
		if err != nil {
			return errors.Wrap(err, "get content of limit configuration")
		}
		limitsConfig, err = receive.ParseRootLimitConfig(limitsContentYaml)
		if err != nil {
			return errors.Wrap(err, "parse limit configuration")
		}
	}
	limiter, err := receive.NewLimiter(conf.writeLimitsConfig, reg, receiveMode, log.With(logger, "component", "receive-limiter"), conf.limitsConfigReloadTimer)
	if err != nil {
		return errors.Wrap(err, "creating limiter")
	}

	webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{
		Writer:            writer,
		ListenAddress:     conf.rwAddress,
		Registry:          reg,
		Endpoint:          conf.endpoint,
		TenantHeader:      conf.tenantHeader,
		TenantField:       conf.tenantField,
		DefaultTenantID:   conf.defaultTenantID,
		ReplicaHeader:     conf.replicaHeader,
		ReplicationFactor: conf.replicationFactor,
		RelabelConfigs:    relabelConfig,
		ReceiverMode:      receiveMode,
		Tracer:            tracer,
		TLSConfig:         rwTLSConfig,
		DialOpts:          dialOpts,
		ForwardTimeout:    time.Duration(*conf.forwardTimeout),
		MaxBackoff:        time.Duration(*conf.maxBackoff),
		TSDBStats:         dbs,
		Limiter:           limiter,
	})

	grpcProbe := prober.NewGRPC()
	httpProbe := prober.NewHTTP()
	statusProber := prober.Combine(
		httpProbe,
		grpcProbe,
		prober.NewInstrumentation(comp, logger, extprom.WrapRegistererWithPrefix("thanos_", reg)),
	)

	// Start all components while we wait for TSDB to open but only load
	// initial config and mark ourselves as ready after it completes.

	// hashringChangedChan signals when TSDB needs to be flushed and updated due to hashring config change.
	hashringChangedChan := make(chan struct{}, 1)

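	// A note on the channel choreography between the routines set up below:
	// the hashring watcher signals hashringChangedChan; on such a signal the
	// TSDB routine flushes and reopens storage, then (if uploads are enabled)
	// pokes uploadC and waits for the uploader to answer on uploadDone, so a
	// hashring change is fully applied before the server is marked ready again.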
	if enableIngestion {
		// uploadC signals when new blocks should be uploaded.
		uploadC := make(chan struct{}, 1)
		// uploadDone signals when uploading has finished.
		uploadDone := make(chan struct{}, 1)

		level.Debug(logger).Log("msg", "setting up TSDB")
		{
			if err := startTSDBAndUpload(g, logger, reg, dbs, uploadC, hashringChangedChan, upload, uploadDone, statusProber, bkt, receive.HashringAlgorithm(conf.hashringsAlgorithm)); err != nil {
				return err
			}
		}
	}

	level.Debug(logger).Log("msg", "setting up hashring")
	{
		if err := setupHashring(g, logger, reg, conf, hashringChangedChan, webHandler, statusProber, enableIngestion, dbs); err != nil {
			return err
		}
	}

	level.Debug(logger).Log("msg", "setting up HTTP server")
	{
		srv := httpserver.New(logger, reg, comp, httpProbe,
			httpserver.WithListen(*conf.httpBindAddr),
			httpserver.WithGracePeriod(time.Duration(*conf.httpGracePeriod)),
			httpserver.WithTLSConfig(*conf.httpTLSConfig),
		)
		g.Add(func() error {
			statusProber.Healthy()
			return srv.ListenAndServe()
		}, func(err error) {
			statusProber.NotReady(err)
			defer statusProber.NotHealthy(err)

			srv.Shutdown(err)
		})
	}

	level.Debug(logger).Log("msg", "setting up gRPC server")
	{
		tlsCfg, err := tls.NewServerConfig(log.With(logger, "protocol", "gRPC"), conf.grpcConfig.tlsSrvCert, conf.grpcConfig.tlsSrvKey, conf.grpcConfig.tlsSrvClientCA)
		if err != nil {
			return errors.Wrap(err, "setup gRPC server")
		}

		options := []store.ProxyStoreOption{}
		if debugLogging {
			options = append(options, store.WithProxyStoreDebugLogging())
		}

		proxy := store.NewProxyStore(
			logger,
			reg,
			dbs.TSDBLocalClients,
			comp,
			labels.Labels{},
			0,
			store.LazyRetrieval,
			options...,
		)
		mts := store.NewLimitedStoreServer(store.NewInstrumentedStoreServer(reg, proxy), reg, conf.storeRateLimits)
		rw := store.ReadWriteTSDBStore{
			StoreServer:          mts,
			WriteableStoreServer: webHandler,
		}

		infoSrv := info.NewInfoServer(
			component.Receive.String(),
			info.WithLabelSetFunc(func() []labelpb.ZLabelSet { return proxy.LabelSet() }),
			info.WithStoreInfoFunc(func() *infopb.StoreInfo {
				if httpProbe.IsReady() {
					minTime, maxTime := proxy.TimeRange()
					return &infopb.StoreInfo{
						MinTime:                      minTime,
						MaxTime:                      maxTime,
						SupportsSharding:             true,
						SupportsWithoutReplicaLabels: true,
						TsdbInfos:                    proxy.TSDBInfos(),
					}
				}
				return nil
			}),
			info.WithExemplarsInfoFunc(),
		)

		srv := grpcserver.New(logger, receive.NewUnRegisterer(reg), tracer, grpcLogOpts, tagOpts, comp, grpcProbe,
			grpcserver.WithServer(store.RegisterStoreServer(rw, logger)),
			grpcserver.WithServer(store.RegisterWritableStoreServer(rw)),
			grpcserver.WithServer(exemplars.RegisterExemplarsServer(exemplars.NewMultiTSDB(dbs.TSDBExemplars))),
			grpcserver.WithServer(info.RegisterInfoServer(infoSrv)),
			grpcserver.WithListen(conf.grpcConfig.bindAddress),
			grpcserver.WithGracePeriod(conf.grpcConfig.gracePeriod),
			grpcserver.WithMaxConnAge(conf.grpcConfig.maxConnectionAge),
			grpcserver.WithTLSConfig(tlsCfg),
		)

		g.Add(
			func() error {
				level.Info(logger).Log("msg", "listening for StoreAPI and WritableStoreAPI gRPC", "address", conf.grpcConfig.bindAddress)
				statusProber.Healthy()
				return srv.ListenAndServe()
			},
			func(err error) {
				statusProber.NotReady(err)
				defer statusProber.NotHealthy(err)

				srv.Shutdown(err)
			},
		)
	}

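	// The remote-write handler below serves on its own listener
	// (--remote-write.address, default 0.0.0.0:19291), separate from the
	// HTTP server above, which exposes probes, metrics, and debug endpoints.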
	level.Debug(logger).Log("msg", "setting up receive HTTP handler")
	{
		g.Add(
			func() error {
				return errors.Wrap(webHandler.Run(), "error starting web server")
			},
			func(err error) {
				webHandler.Close()
			},
		)
	}

	if limitsConfig.AreHeadSeriesLimitsConfigured() {
		level.Info(logger).Log("msg", "setting up periodic (every 15s) meta-monitoring query for limiting cache")
		{
			ctx, cancel := context.WithCancel(context.Background())
			g.Add(func() error {
				return runutil.Repeat(15*time.Second, ctx.Done(), func() error {
					if err := limiter.HeadSeriesLimiter.QueryMetaMonitoring(ctx); err != nil {
						level.Error(logger).Log("msg", "failed to query meta-monitoring", "err", err.Error())
					}
					return nil
				})
			}, func(err error) {
				cancel()
			})
		}
	}

	level.Debug(logger).Log("msg", "setting up periodic tenant pruning")
	{
		ctx, cancel := context.WithCancel(context.Background())
		g.Add(func() error {
			return runutil.Repeat(2*time.Hour, ctx.Done(), func() error {
				if err := dbs.Prune(ctx); err != nil {
					level.Error(logger).Log("err", err)
				}
				return nil
			})
		}, func(err error) {
			cancel()
		})
	}

	{
		if limiter.CanReload() {
			ctx, cancel := context.WithCancel(context.Background())
			g.Add(func() error {
				level.Debug(logger).Log("msg", "limits config initialized with file watcher.")
				if err := limiter.StartConfigReloader(ctx); err != nil {
					return err
				}
				<-ctx.Done()
				return nil
			}, func(err error) {
				cancel()
			})
		}
	}

	level.Info(logger).Log("msg", "starting receiver")
	return nil
}

// setupHashring sets up the provided hashring configuration.
// If no hashring is provided, we set up a single-node hashring with the local endpoint.
func setupHashring(g *run.Group,
	logger log.Logger,
	reg *prometheus.Registry,
	conf *receiveConfig,
	hashringChangedChan chan struct{},
	webHandler *receive.Handler,
	statusProber prober.Probe,
	enableIngestion bool,
	dbs *receive.MultiTSDB,
) error {
	// Note: the hashring configuration watcher
	// is the sender and thus closes the chan.
	// In the single-node case, which has no configuration
	// watcher, we close the chan ourselves.
	updates := make(chan []receive.HashringConfig, 1)
	algorithm := receive.HashringAlgorithm(conf.hashringsAlgorithm)

	// If a hashring config file path was given, initialize the config watcher.
	if conf.hashringsFilePath != "" {
		cw, err := receive.NewConfigWatcher(log.With(logger, "component", "config-watcher"), reg, conf.hashringsFilePath, *conf.refreshInterval)
		if err != nil {
			return errors.Wrap(err, "failed to initialize config watcher")
		}

		// Check the hashring configuration before running the watcher.
		if err := cw.ValidateConfig(); err != nil {
			cw.Stop()
			close(updates)
			return errors.Wrap(err, "failed to validate hashring configuration file")
		}

		ctx, cancel := context.WithCancel(context.Background())
		g.Add(func() error {
			return receive.ConfigFromWatcher(ctx, updates, cw)
		}, func(error) {
			cancel()
		})
	} else {
		var (
			cf  []receive.HashringConfig
			err error
		)
		// Otherwise, initialize the configuration from the hashring file content, if any was given.
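		// An illustrative hashring configuration (shape only; fields follow
		// receive.HashringConfig, values are hypothetical):
		//
		//	[
		//	  {
		//	    "hashring": "default",
		//	    "tenants": ["tenant-a", "tenant-b"],
		//	    "endpoints": ["receive-0:10901", "receive-1:10901"]
		//	  }
		//	]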
		if len(conf.hashringsFileContent) > 0 {
			cf, err = receive.ParseConfig([]byte(conf.hashringsFileContent))
			if err != nil {
				close(updates)
				return errors.Wrap(err, "failed to parse hashring configuration content")
			}
		}

		cancel := make(chan struct{})
		g.Add(func() error {
			defer close(updates)
			updates <- cf
			<-cancel
			return nil
		}, func(error) {
			close(cancel)
		})
	}

	cancel := make(chan struct{})
	g.Add(func() error {

		if enableIngestion {
			defer close(hashringChangedChan)
		}

		for {
			select {
			case c, ok := <-updates:
				if !ok {
					return nil
				}

				if c == nil {
					webHandler.Hashring(receive.SingleNodeHashring(conf.endpoint))
					level.Info(logger).Log("msg", "Empty hashring config. Set up single node hashring.")
				} else {
					h, err := receive.NewMultiHashring(algorithm, conf.replicationFactor, c)
					if err != nil {
						return errors.Wrap(err, "unable to create new hashring from config")
					}
					webHandler.Hashring(h)
					level.Info(logger).Log("msg", "Set up hashring for the given hashring config.")
				}

				if err := dbs.SetHashringConfig(c); err != nil {
					return errors.Wrap(err, "failed to set hashring config in MultiTSDB")
				}

				// If ingestion is enabled, send a signal to TSDB to flush.
				if enableIngestion {
					hashringChangedChan <- struct{}{}
				} else {
					// If not, just signal that we are ready (this is important during the first hashring load).
					statusProber.Ready()
				}
			case <-cancel:
				return nil
			}
		}
	}, func(err error) {
		close(cancel)
	},
	)
	return nil
}

// startTSDBAndUpload starts the multi-TSDB and sets up the rungroup to flush the TSDB and reload on hashring change.
// It also uploads blocks to object storage, if upload is enabled.
func startTSDBAndUpload(g *run.Group,
	logger log.Logger,
	reg *prometheus.Registry,
	dbs *receive.MultiTSDB,
	uploadC chan struct{},
	hashringChangedChan chan struct{},
	upload bool,
	uploadDone chan struct{},
	statusProber prober.Probe,
	bkt objstore.Bucket,
	hashringAlgorithm receive.HashringAlgorithm,
) error {

	logger = log.With(logger, "component", "storage")
	dbUpdatesStarted := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "thanos_receive_multi_db_updates_attempted_total",
		Help: "Number of Multi DB attempted reloads with flush and potential upload due to hashring changes",
	})
	dbUpdatesCompleted := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "thanos_receive_multi_db_updates_completed_total",
		Help: "Number of Multi DB completed reloads with flush and potential upload due to hashring changes",
	})

	level.Debug(logger).Log("msg", "removing storage lock files if any")
	if err := dbs.RemoveLockFilesIfAny(); err != nil {
		return errors.Wrap(err, "remove storage lock files")
	}

	// TSDBs reload logic, listening on hashring changes.
	cancel := make(chan struct{})
	g.Add(func() error {
		defer close(uploadC)

		// Before quitting, ensure the WAL is flushed and the DBs are closed.
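		// Flushing before closing means a clean shutdown persists the in-memory
		// head as a block on disk rather than relying solely on WAL replay at the
		// next startup, and gives the uploader a final block to ship.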
		defer func() {
			level.Info(logger).Log("msg", "shutting down storage")
			if err := dbs.Flush(); err != nil {
				level.Error(logger).Log("err", err, "msg", "failed to flush storage")
			} else {
				level.Info(logger).Log("msg", "storage is flushed successfully")
			}
			if err := dbs.Close(); err != nil {
				level.Error(logger).Log("err", err, "msg", "failed to close storage")
				return
			}
			level.Info(logger).Log("msg", "storage is closed")
		}()

		var initialized bool
		for {
			select {
			case <-cancel:
				return nil
			case _, ok := <-hashringChangedChan:
				if !ok {
					return nil
				}

				// When using Ketama as the hashring algorithm, there is no need to flush the TSDB head.
				// If new receivers were added to the hashring, existing receivers will not need to
				// ingest additional series.
				// If receivers are removed from the hashring, existing receivers will only need
				// to ingest a subset of the series that were assigned to the removed receivers.
				// As a result, changing the hashring produces no churn, hence no need to force
				// head compaction and upload.
				flushHead := !initialized || hashringAlgorithm != receive.AlgorithmKetama
				if flushHead {
					msg := "hashring has changed; server is not ready to receive requests"
					statusProber.NotReady(errors.New(msg))
					level.Info(logger).Log("msg", msg)

					level.Info(logger).Log("msg", "updating storage")
					dbUpdatesStarted.Inc()
					if err := dbs.Flush(); err != nil {
						return errors.Wrap(err, "flushing storage")
					}
					if err := dbs.Open(); err != nil {
						return errors.Wrap(err, "opening storage")
					}
					if upload {
						uploadC <- struct{}{}
						<-uploadDone
					}
					dbUpdatesCompleted.Inc()
					statusProber.Ready()
					level.Info(logger).Log("msg", "storage started, and server is ready to receive requests")
				}
				initialized = true
			}
		}
	}, func(err error) {
		close(cancel)
	})

	if upload {
		logger := log.With(logger, "component", "uploader")
		upload := func(ctx context.Context) error {
			level.Debug(logger).Log("msg", "upload phase starting")
			start := time.Now()

			uploaded, err := dbs.Sync(ctx)
			if err != nil {
				level.Warn(logger).Log("msg", "upload failed", "elapsed", time.Since(start), "err", err)
				return err
			}
			level.Debug(logger).Log("msg", "upload phase done", "uploaded", uploaded, "elapsed", time.Since(start))
			return nil
		}
		{
			level.Info(logger).Log("msg", "upload enabled, starting initial sync")
			if err := upload(context.Background()); err != nil {
				return errors.Wrap(err, "initial upload failed")
			}
			level.Info(logger).Log("msg", "initial sync done")
		}
		{
			ctx, cancel := context.WithCancel(context.Background())
			g.Add(func() error {
				// Ensure we clean up everything properly.
				defer func() {
					runutil.CloseWithLogOnErr(logger, bkt, "bucket client")
				}()

				// Before quitting, ensure all blocks are uploaded.
				defer func() {
					<-uploadC // Closed by the storage routine when it's done.
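					// One final Sync below picks up any block that the storage
					// shutdown flush may have cut, so nothing is left behind.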
					level.Info(logger).Log("msg", "uploading the final cut block before exiting")
					ctx, cancel := context.WithCancel(context.Background())
					uploaded, err := dbs.Sync(ctx)
					if err != nil {
						cancel()
						level.Error(logger).Log("msg", "the final upload failed", "err", err)
						return
					}
					cancel()
					level.Info(logger).Log("msg", "the final cut block was uploaded", "uploaded", uploaded)
				}()

				defer close(uploadDone)

				// Run the uploader in a loop.
				tick := time.NewTicker(30 * time.Second)
				defer tick.Stop()

				for {
					select {
					case <-ctx.Done():
						return nil
					case <-uploadC:
						// Upload on demand.
						if err := upload(ctx); err != nil {
							level.Error(logger).Log("msg", "on demand upload failed", "err", err)
						}
						uploadDone <- struct{}{}
					case <-tick.C:
						if err := upload(ctx); err != nil {
							level.Error(logger).Log("msg", "recurring upload failed", "err", err)
						}
					}
				}
			}, func(error) {
				cancel()
			})
		}
	}

	return nil
}

func migrateLegacyStorage(logger log.Logger, dataDir, defaultTenantID string) error {
	defaultTenantDataDir := path.Join(dataDir, defaultTenantID)

	if _, err := os.Stat(defaultTenantDataDir); !os.IsNotExist(err) {
		level.Info(logger).Log("msg", "default tenant data dir already present, not attempting to migrate storage")
		return nil
	}

	if _, err := os.Stat(dataDir); os.IsNotExist(err) {
		level.Info(logger).Log("msg", "no existing storage found, no data migration attempted")
		return nil
	}

	level.Info(logger).Log("msg", "found legacy storage, migrating to multi-tsdb layout with default tenant", "defaultTenantID", defaultTenantID)

	files, err := os.ReadDir(dataDir)
	if err != nil {
		return errors.Wrapf(err, "read legacy data dir: %v", dataDir)
	}

	if err := os.MkdirAll(defaultTenantDataDir, 0750); err != nil {
		return errors.Wrapf(err, "create default tenant data dir: %v", defaultTenantDataDir)
	}

	for _, f := range files {
		from := path.Join(dataDir, f.Name())
		to := path.Join(defaultTenantDataDir, f.Name())
		if err := os.Rename(from, to); err != nil {
			return errors.Wrapf(err, "migrate file from %v to %v", from, to)
		}
	}

	return nil
}

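// Illustration of the migration above (assuming the stock default tenant ID
// and a hypothetical block ULID): a legacy single-TSDB layout such as
//
//	data/wal/  data/chunks_head/  data/01ABC.../
//
// is moved under a per-tenant directory:
//
//	data/default-tenant/wal/  data/default-tenant/chunks_head/  data/default-tenant/01ABC.../
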
type receiveConfig struct {
	httpBindAddr    *string
	httpGracePeriod *model.Duration
	httpTLSConfig   *string

	grpcConfig grpcConfig

	rwAddress          string
	rwServerCert       string
	rwServerKey        string
	rwServerClientCA   string
	rwClientCert       string
	rwClientKey        string
	rwClientServerCA   string
	rwClientServerName string

	dataDir   string
	labelStrs []string

	objStoreConfig *extflag.PathOrContent
	retention      *model.Duration

	hashringsFilePath    string
	hashringsFileContent string
	hashringsAlgorithm   string

	refreshInterval   *model.Duration
	endpoint          string
	tenantHeader      string
	tenantField       string
	tenantLabelName   string
	defaultTenantID   string
	replicaHeader     string
	replicationFactor uint64
	forwardTimeout    *model.Duration
	maxBackoff        *model.Duration
	compression       string

	tsdbMinBlockDuration         *model.Duration
	tsdbMaxBlockDuration         *model.Duration
	tsdbTooFarInFutureTimeWindow *model.Duration
	tsdbOutOfOrderTimeWindow     *model.Duration
	tsdbOutOfOrderCapMax         int64
	tsdbAllowOverlappingBlocks   bool
	tsdbMaxExemplars             int64
	tsdbWriteQueueSize           int64
	tsdbMemorySnapshotOnShutdown bool
	tsdbEnableNativeHistograms   bool

	walCompression  bool
	noLockFile      bool
	writerInterning bool

	hashFunc string

	ignoreBlockSize       bool
	allowOutOfOrderUpload bool

	reqLogConfig      *extflag.PathOrContent
	relabelConfigPath *extflag.PathOrContent

	writeLimitsConfig       *extflag.PathOrContent
	storeRateLimits         store.SeriesSelectLimits
	limitsConfigReloadTimer time.Duration
}

func (rc *receiveConfig) registerFlag(cmd extkingpin.FlagClause) {
	rc.httpBindAddr, rc.httpGracePeriod, rc.httpTLSConfig = extkingpin.RegisterHTTPFlags(cmd)
	rc.grpcConfig.registerFlag(cmd)
	rc.storeRateLimits.RegisterFlags(cmd)

	cmd.Flag("remote-write.address", "Address to listen on for remote write requests.").
		Default("0.0.0.0:19291").StringVar(&rc.rwAddress)

	cmd.Flag("remote-write.server-tls-cert", "TLS Certificate for HTTP server, leave blank to disable TLS.").Default("").StringVar(&rc.rwServerCert)

	cmd.Flag("remote-write.server-tls-key", "TLS Key for the HTTP server, leave blank to disable TLS.").Default("").StringVar(&rc.rwServerKey)

	cmd.Flag("remote-write.server-tls-client-ca", "TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert)").Default("").StringVar(&rc.rwServerClientCA)

	cmd.Flag("remote-write.client-tls-cert", "TLS Certificates to use to identify this client to the server.").Default("").StringVar(&rc.rwClientCert)

	cmd.Flag("remote-write.client-tls-key", "TLS Key for the client's certificate.").Default("").StringVar(&rc.rwClientKey)

	cmd.Flag("remote-write.client-tls-ca", "TLS CA Certificates to use to verify servers.").Default("").StringVar(&rc.rwClientServerCA)

	cmd.Flag("remote-write.client-server-name", "Server name to verify the hostname on the returned TLS certificates. See https://tools.ietf.org/html/rfc4366#section-3.1").Default("").StringVar(&rc.rwClientServerName)

	cmd.Flag("tsdb.path", "Data directory of TSDB.").
		Default("./data").StringVar(&rc.dataDir)

	cmd.Flag("label", "External labels to announce. This flag will be removed in the future when handling of multiple tsdb instances is added.").PlaceHolder("key=\"value\"").StringsVar(&rc.labelStrs)

	rc.objStoreConfig = extkingpin.RegisterCommonObjStoreFlags(cmd, "", false)

	rc.retention = extkingpin.ModelDuration(cmd.Flag("tsdb.retention", "How long to retain raw samples on local storage. 0d - disables the retention policy (i.e. infinite retention). For more details on how retention is enforced for individual tenants, please refer to the Tenant lifecycle management section in the Receive documentation: https://thanos.io/tip/components/receive.md/#tenant-lifecycle-management").Default("15d"))

	cmd.Flag("receive.hashrings-file", "Path to file that contains the hashring configuration. A watcher is initialized to watch changes and update the hashring dynamically.").PlaceHolder("<path>").StringVar(&rc.hashringsFilePath)

	cmd.Flag("receive.hashrings", "Alternative to 'receive.hashrings-file' flag (lower priority). Content of file that contains the hashring configuration.").PlaceHolder("<content>").StringVar(&rc.hashringsFileContent)

	hashringAlgorithmsHelptext := strings.Join([]string{string(receive.AlgorithmHashmod), string(receive.AlgorithmKetama)}, ", ")
	cmd.Flag("receive.hashrings-algorithm", "The algorithm used when distributing series in the hashrings. Must be one of "+hashringAlgorithmsHelptext+". Will be overwritten by the tenant-specific algorithm in the hashring config.").
		Default(string(receive.AlgorithmHashmod)).
		EnumVar(&rc.hashringsAlgorithm, string(receive.AlgorithmHashmod), string(receive.AlgorithmKetama))
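	// Illustrative invocation combining the hashring flags registered here
	// (values are hypothetical):
	//
	//	thanos receive \
	//	  --receive.hashrings-file=/etc/thanos/hashrings.json \
	//	  --receive.hashrings-algorithm=ketama \
	//	  --receive.local-endpoint=127.0.0.1:10901
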
	rc.refreshInterval = extkingpin.ModelDuration(cmd.Flag("receive.hashrings-file-refresh-interval", "Refresh interval to re-read the hashring configuration file. (used as a fallback)").
		Default("5m"))

	cmd.Flag("receive.local-endpoint", "Endpoint of local receive node. Used to identify the local node in the hashring configuration. If it's empty AND hashring configuration was provided, it means that receive will run in RoutingOnly mode.").StringVar(&rc.endpoint)

	cmd.Flag("receive.tenant-header", "HTTP header to determine tenant for write requests.").Default(tenancy.DefaultTenantHeader).StringVar(&rc.tenantHeader)

	cmd.Flag("receive.tenant-certificate-field", "Use TLS client's certificate field to determine tenant for write requests. Must be one of "+tenancy.CertificateFieldOrganization+", "+tenancy.CertificateFieldOrganizationalUnit+" or "+tenancy.CertificateFieldCommonName+". This setting will cause the receive.tenant-header flag value to be ignored.").Default("").EnumVar(&rc.tenantField, "", tenancy.CertificateFieldOrganization, tenancy.CertificateFieldOrganizationalUnit, tenancy.CertificateFieldCommonName)

	cmd.Flag("receive.default-tenant-id", "Default tenant ID to use when none is provided via a header.").Default(tenancy.DefaultTenant).StringVar(&rc.defaultTenantID)

	cmd.Flag("receive.tenant-label-name", "Label name through which the tenant will be announced.").Default(tenancy.DefaultTenantLabel).StringVar(&rc.tenantLabelName)

	cmd.Flag("receive.replica-header", "HTTP header specifying the replica number of a write request.").Default(receive.DefaultReplicaHeader).StringVar(&rc.replicaHeader)

	compressionOptions := strings.Join([]string{snappy.Name, compressionNone}, ", ")
	cmd.Flag("receive.grpc-compression", "Compression algorithm to use for gRPC requests to other receivers. Must be one of: "+compressionOptions).Default(snappy.Name).EnumVar(&rc.compression, snappy.Name, compressionNone)

	cmd.Flag("receive.replication-factor", "How many times to replicate incoming write requests.").Default("1").Uint64Var(&rc.replicationFactor)

	rc.forwardTimeout = extkingpin.ModelDuration(cmd.Flag("receive-forward-timeout", "Timeout for each forward request.").Default("5s").Hidden())

	rc.maxBackoff = extkingpin.ModelDuration(cmd.Flag("receive-forward-max-backoff", "Maximum backoff for each forward fan-out request.").Default("5s").Hidden())

	rc.relabelConfigPath = extflag.RegisterPathOrContent(cmd, "receive.relabel-config", "YAML file that contains relabeling configuration.", extflag.WithEnvSubstitution())

	rc.tsdbMinBlockDuration = extkingpin.ModelDuration(cmd.Flag("tsdb.min-block-duration", "Min duration for local TSDB blocks").Default("2h").Hidden())

	rc.tsdbMaxBlockDuration = extkingpin.ModelDuration(cmd.Flag("tsdb.max-block-duration", "Max duration for local TSDB blocks").Default("2h").Hidden())

	rc.tsdbTooFarInFutureTimeWindow = extkingpin.ModelDuration(cmd.Flag("tsdb.too-far-in-future.time-window",
		"[EXPERIMENTAL] Configures the allowed time window for ingesting samples too far in the future. Disabled (0s) by default. "+
			"Please note that enabling this flag will reject samples with timestamps beyond the receiver's local (NTP-synced) time plus the configured duration, which can occur due to clock skew in remote write clients.",
	).Default("0s"))
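	// For example (hypothetical value): with --tsdb.too-far-in-future.time-window=1m,
	// a sample timestamped more than one minute ahead of this receiver's clock is rejected.
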
	rc.tsdbOutOfOrderTimeWindow = extkingpin.ModelDuration(cmd.Flag("tsdb.out-of-order.time-window",
		"[EXPERIMENTAL] Configures the allowed time window for ingestion of out-of-order samples. Disabled (0s) by default. "+
			"Please note that if you enable this option and use the compactor, make sure you have the --enable-vertical-compaction flag enabled, otherwise you might risk a compactor halt.",
	).Default("0s").Hidden())

	cmd.Flag("tsdb.out-of-order.cap-max",
		"[EXPERIMENTAL] Configures the maximum capacity for out-of-order chunks (in samples). If set to <=0, the default value 32 is assumed.",
	).Default("0").Hidden().Int64Var(&rc.tsdbOutOfOrderCapMax)

	cmd.Flag("tsdb.allow-overlapping-blocks", "Allow overlapping blocks, which in turn enables vertical compaction and vertical query merge. Deprecated: does not do anything; overlapping blocks are enabled all the time.").Default("false").BoolVar(&rc.tsdbAllowOverlappingBlocks)

	cmd.Flag("tsdb.wal-compression", "Compress the tsdb WAL.").Default("true").BoolVar(&rc.walCompression)

	cmd.Flag("tsdb.no-lockfile", "Do not create lockfile in TSDB data directory. In any case, the lockfiles will be deleted on next startup.").Default("false").BoolVar(&rc.noLockFile)

	cmd.Flag("tsdb.max-exemplars",
		"Enables support for ingesting exemplars and sets the maximum number of exemplars that will be stored per tenant."+
			" In case the exemplar storage becomes full (number of stored exemplars becomes equal to max-exemplars),"+
			" ingesting a new exemplar will evict the oldest exemplar from storage. A value of 0 (or less) disables exemplar storage.").
		Default("0").Int64Var(&rc.tsdbMaxExemplars)

	cmd.Flag("tsdb.write-queue-size",
		"[EXPERIMENTAL] Enables configuring the size of the chunk write queue used in the head chunks mapper. "+
			"A queue size of zero (default) disables this feature entirely.").
		Default("0").Hidden().Int64Var(&rc.tsdbWriteQueueSize)

	cmd.Flag("tsdb.memory-snapshot-on-shutdown",
		"[EXPERIMENTAL] Enables feature to snapshot in-memory chunks on shutdown for faster restarts.").
		Default("false").Hidden().BoolVar(&rc.tsdbMemorySnapshotOnShutdown)

	cmd.Flag("tsdb.enable-native-histograms",
		"[EXPERIMENTAL] Enables the ingestion of native histograms.").
		Default("false").Hidden().BoolVar(&rc.tsdbEnableNativeHistograms)

	cmd.Flag("writer.intern",
		"[EXPERIMENTAL] Enables string interning in the receive writer, for more optimized memory usage.").
		Default("false").Hidden().BoolVar(&rc.writerInterning)

	cmd.Flag("hash-func", "Specify which hash function to use when calculating the hashes of produced files. If no function is specified, no hashes are calculated. This permits avoiding downloading some files twice, albeit at some performance cost. Possible values are: \"\", \"SHA256\".").
		Default("").EnumVar(&rc.hashFunc, "SHA256", "")

	cmd.Flag("shipper.ignore-unequal-block-size", "If true, receive will not require the min and max block size flags to be set to the same value. Only use this if you want to keep long retention and compaction enabled, as in the worst case it can result in ~2h of data loss for your Thanos bucket storage.").Default("false").Hidden().BoolVar(&rc.ignoreBlockSize)

	cmd.Flag("shipper.allow-out-of-order-uploads",
		"If true, shipper will skip failed block uploads in the given iteration and retry later. This means that some newer blocks might be uploaded sooner than older blocks. "+
			"This can trigger compaction without those blocks and as a result will create an overlap situation. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring "+
			"about order.").
		Default("false").Hidden().BoolVar(&rc.allowOutOfOrderUpload)

	rc.reqLogConfig = extkingpin.RegisterRequestLoggingFlags(cmd)

	rc.writeLimitsConfig = extflag.RegisterPathOrContent(cmd, "receive.limits-config", "YAML file that contains limit configuration.", extflag.WithEnvSubstitution(), extflag.WithHidden())
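	// An illustrative limits configuration (shape only; consult the Receive
	// documentation for the authoritative schema, values are hypothetical):
	//
	//	write:
	//	  global:
	//	    max_concurrency: 30
	//	  default:
	//	    request:
	//	      series_limit: 1000
	//	      samples_limit: 10000
	//	    head_series_limit: 100000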
	cmd.Flag("receive.limits-config-reload-timer", "Minimum amount of time to pass for the limit configuration to be reloaded. Helps to avoid excessive reloads.").
		Default("1s").Hidden().DurationVar(&rc.limitsConfigReloadTimer)
}

// determineMode returns the ReceiverMode that this receiver is configured to run in.
// This is used to configure this Receiver's forwarding and ingesting behavior at runtime.
func (rc *receiveConfig) determineMode() receive.ReceiverMode {
	// Has the user provided some kind of hashring configuration?
	hashringSpecified := rc.hashringsFileContent != "" || rc.hashringsFilePath != ""
	// Has the user specified the --receive.local-endpoint flag?
	localEndpointSpecified := rc.endpoint != ""

	switch {
	case hashringSpecified && localEndpointSpecified:
		return receive.RouterIngestor
	case hashringSpecified && !localEndpointSpecified:
		// Be careful - if the hashring contains an address that routes to itself and does not specify a local
		// endpoint - you've just created an infinite loop / fork bomb :)
		return receive.RouterOnly
	default:
		// hashring configuration has not been provided so we ingest all metrics locally.
		return receive.IngestorOnly
	}
}