github.com/thanos-io/thanos@v0.32.5/cmd/thanos/sidecar.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package main
     5  
     6  import (
     7  	"context"
     8  	"math"
     9  	"net/url"
    10  	"sync"
    11  	"time"
    12  
    13  	extflag "github.com/efficientgo/tools/extkingpin"
    14  	"github.com/go-kit/log"
    15  	"github.com/go-kit/log/level"
    16  	grpc_logging "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/logging"
    17  	"github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/tags"
    18  	"github.com/oklog/run"
    19  	"github.com/opentracing/opentracing-go"
    20  	"github.com/pkg/errors"
    21  	"github.com/prometheus/client_golang/prometheus"
    22  	"github.com/prometheus/client_golang/prometheus/promauto"
    23  	"github.com/prometheus/common/model"
    24  	"github.com/prometheus/prometheus/model/labels"
    25  
    26  	"github.com/thanos-io/objstore"
    27  	"github.com/thanos-io/objstore/client"
    28  	objstoretracing "github.com/thanos-io/objstore/tracing/opentracing"
    29  
    30  	"github.com/thanos-io/thanos/pkg/block/metadata"
    31  	"github.com/thanos-io/thanos/pkg/component"
    32  	"github.com/thanos-io/thanos/pkg/exemplars"
    33  	"github.com/thanos-io/thanos/pkg/extkingpin"
    34  	"github.com/thanos-io/thanos/pkg/extprom"
    35  	"github.com/thanos-io/thanos/pkg/httpconfig"
    36  	"github.com/thanos-io/thanos/pkg/info"
    37  	"github.com/thanos-io/thanos/pkg/info/infopb"
    38  	"github.com/thanos-io/thanos/pkg/logging"
    39  	meta "github.com/thanos-io/thanos/pkg/metadata"
    40  	thanosmodel "github.com/thanos-io/thanos/pkg/model"
    41  	"github.com/thanos-io/thanos/pkg/prober"
    42  	"github.com/thanos-io/thanos/pkg/promclient"
    43  	"github.com/thanos-io/thanos/pkg/reloader"
    44  	"github.com/thanos-io/thanos/pkg/rules"
    45  	"github.com/thanos-io/thanos/pkg/runutil"
    46  	grpcserver "github.com/thanos-io/thanos/pkg/server/grpc"
    47  	httpserver "github.com/thanos-io/thanos/pkg/server/http"
    48  	"github.com/thanos-io/thanos/pkg/shipper"
    49  	"github.com/thanos-io/thanos/pkg/store"
    50  	"github.com/thanos-io/thanos/pkg/store/labelpb"
    51  	"github.com/thanos-io/thanos/pkg/targets"
    52  	"github.com/thanos-io/thanos/pkg/tls"
    53  )
    54  
    55  func registerSidecar(app *extkingpin.App) {
    56  	cmd := app.Command(component.Sidecar.String(), "Sidecar for Prometheus server.")
    57  	conf := &sidecarConfig{}
    58  	conf.registerFlag(cmd)
    59  	cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error {
    60  		tagOpts, grpcLogOpts, err := logging.ParsegRPCOptions("", conf.reqLogConfig)
    61  		if err != nil {
    62  			return errors.Wrap(err, "error while parsing config for request logging")
    63  		}
    64  
    65  		rl := reloader.New(log.With(logger, "component", "reloader"),
    66  			extprom.WrapRegistererWithPrefix("thanos_sidecar_", reg),
    67  			&reloader.Options{
    68  				ReloadURL:     reloader.ReloadURLFromBase(conf.prometheus.url),
    69  				CfgFile:       conf.reloader.confFile,
    70  				CfgOutputFile: conf.reloader.envVarConfFile,
    71  				WatchedDirs:   conf.reloader.ruleDirectories,
    72  				WatchInterval: conf.reloader.watchInterval,
    73  				RetryInterval: conf.reloader.retryInterval,
    74  			})
    75  
    76  		return runSidecar(g, logger, reg, tracer, rl, component.Sidecar, *conf, grpcLogOpts, tagOpts)
    77  	})
    78  }
    79  
    80  func runSidecar(
    81  	g *run.Group,
    82  	logger log.Logger,
    83  	reg *prometheus.Registry,
    84  	tracer opentracing.Tracer,
    85  	reloader *reloader.Reloader,
    86  	comp component.Component,
    87  	conf sidecarConfig,
    88  	grpcLogOpts []grpc_logging.Option,
    89  	tagOpts []tags.Option,
    90  ) error {
    91  	httpConfContentYaml, err := conf.prometheus.httpClient.Content()
    92  	if err != nil {
    93  		return errors.Wrap(err, "getting http client config")
    94  	}
    95  	httpClientConfig, err := httpconfig.NewClientConfigFromYAML(httpConfContentYaml)
    96  	if err != nil {
    97  		return errors.Wrap(err, "parsing http config YAML")
    98  	}
    99  
   100  	httpClient, err := httpconfig.NewHTTPClient(*httpClientConfig, "thanos-sidecar")
   101  	if err != nil {
   102  		return errors.Wrap(err, "Improper http client config")
   103  	}
   104  
   105  	reloader.SetHttpClient(*httpClient)
   106  
   107  	var m = &promMetadata{
   108  		promURL: conf.prometheus.url,
   109  
   110  		// Start out with the full time range. The shipper will constrain it later.
   111  		// TODO(fabxc): minimum timestamp is never adjusted if shipping is disabled.
   112  		mint: conf.limitMinTime.PrometheusTimestamp(),
   113  		maxt: math.MaxInt64,
   114  
   115  		limitMinTime: conf.limitMinTime,
   116  		client:       promclient.NewWithTracingClient(logger, httpClient, "thanos-sidecar"),
   117  	}
   118  
   119  	confContentYaml, err := conf.objStore.Content()
   120  	if err != nil {
   121  		return errors.Wrap(err, "getting object store config")
   122  	}
   123  
   124  	var uploads = true
   125  	if len(confContentYaml) == 0 {
   126  		level.Info(logger).Log("msg", "no supported bucket was configured, uploads will be disabled")
   127  		uploads = false
   128  	}
   129  
   130  	grpcProbe := prober.NewGRPC()
   131  	httpProbe := prober.NewHTTP()
   132  	statusProber := prober.Combine(
   133  		httpProbe,
   134  		grpcProbe,
   135  		prober.NewInstrumentation(comp, logger, extprom.WrapRegistererWithPrefix("thanos_", reg)),
   136  	)
   137  
   138  	srv := httpserver.New(logger, reg, comp, httpProbe,
   139  		httpserver.WithListen(conf.http.bindAddress),
   140  		httpserver.WithGracePeriod(time.Duration(conf.http.gracePeriod)),
   141  		httpserver.WithTLSConfig(conf.http.tlsConfig),
   142  	)
   143  
   144  	g.Add(func() error {
   145  		statusProber.Healthy()
   146  
   147  		return srv.ListenAndServe()
   148  	}, func(err error) {
   149  		statusProber.NotReady(err)
   150  		defer statusProber.NotHealthy(err)
   151  
   152  		srv.Shutdown(err)
   153  	})
   154  
   155  	// Setup all the concurrent groups.
   156  	{
   157  		promUp := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
   158  			Name: "thanos_sidecar_prometheus_up",
   159  			Help: "Boolean indicator whether the sidecar can reach its Prometheus peer.",
   160  		})
   161  
   162  		ctx, cancel := context.WithCancel(context.Background())
   163  		g.Add(func() error {
   164  			// Only check Prometheus's flags when upload is enabled.
   165  			if uploads {
   166  				// Check prometheus's flags to ensure same sidecar flags.
   167  				if err := validatePrometheus(ctx, m.client, logger, conf.shipper.ignoreBlockSize, m); err != nil {
   168  					return errors.Wrap(err, "validate Prometheus flags")
   169  				}
   170  			}
   171  
   172  			// We retry infinitely until we reach and fetch BuildVersion from our Prometheus.
   173  			err := runutil.Retry(2*time.Second, ctx.Done(), func() error {
   174  				if err := m.BuildVersion(ctx); err != nil {
   175  					level.Warn(logger).Log(
   176  						"msg", "failed to fetch prometheus version. Is Prometheus running? Retrying",
   177  						"err", err,
   178  					)
   179  					return err
   180  				}
   181  
   182  				level.Info(logger).Log(
   183  					"msg", "successfully loaded prometheus version",
   184  				)
   185  				return nil
   186  			})
   187  			if err != nil {
   188  				return errors.Wrap(err, "failed to get prometheus version")
   189  			}
   190  
   191  			// Blocking query of external labels before joining as a Source Peer into gossip.
   192  			// We retry infinitely until we reach and fetch labels from our Prometheus.
   193  			err = runutil.Retry(2*time.Second, ctx.Done(), func() error {
   194  				if err := m.UpdateLabels(ctx); err != nil {
   195  					level.Warn(logger).Log(
   196  						"msg", "failed to fetch initial external labels. Is Prometheus running? Retrying",
   197  						"err", err,
   198  					)
   199  					promUp.Set(0)
   200  					statusProber.NotReady(err)
   201  					return err
   202  				}
   203  
   204  				level.Info(logger).Log(
   205  					"msg", "successfully loaded prometheus external labels",
   206  					"external_labels", m.Labels().String(),
   207  				)
   208  				promUp.Set(1)
   209  				statusProber.Ready()
   210  				return nil
   211  			})
   212  			if err != nil {
   213  				return errors.Wrap(err, "initial external labels query")
   214  			}
   215  
   216  			if len(m.Labels()) == 0 {
   217  				return errors.New("no external labels configured on Prometheus server, uniquely identifying external labels must be configured; see https://thanos.io/tip/thanos/storage.md#external-labels for details.")
   218  			}
   219  
   220  			// Periodically query the Prometheus config. We use this as a heartbeat as well as for updating
   221  			// the external labels we apply.
   222  			return runutil.Repeat(conf.prometheus.getConfigInterval, ctx.Done(), func() error {
   223  				iterCtx, iterCancel := context.WithTimeout(context.Background(), conf.prometheus.getConfigTimeout)
   224  				defer iterCancel()
   225  
   226  				if err := m.UpdateLabels(iterCtx); err != nil {
   227  					level.Warn(logger).Log("msg", "heartbeat failed", "err", err)
   228  					promUp.Set(0)
   229  					statusProber.NotReady(err)
   230  				} else {
   231  					promUp.Set(1)
   232  					statusProber.Ready()
   233  				}
   234  
   235  				return nil
   236  			})
   237  		}, func(error) {
   238  			cancel()
   239  		})
   240  	}
   241  	{
   242  		ctx, cancel := context.WithCancel(context.Background())
   243  		g.Add(func() error {
   244  			return reloader.Watch(ctx)
   245  		}, func(error) {
   246  			cancel()
   247  		})
   248  	}
   249  	{
   250  		c := promclient.NewWithTracingClient(logger, httpClient, httpconfig.ThanosUserAgent)
   251  
   252  		promStore, err := store.NewPrometheusStore(logger, reg, c, conf.prometheus.url, component.Sidecar, m.Labels, m.Timestamps, m.Version)
   253  		if err != nil {
   254  			return errors.Wrap(err, "create Prometheus store")
   255  		}
   256  
   257  		tlsCfg, err := tls.NewServerConfig(log.With(logger, "protocol", "gRPC"),
   258  			conf.grpc.tlsSrvCert, conf.grpc.tlsSrvKey, conf.grpc.tlsSrvClientCA)
   259  		if err != nil {
   260  			return errors.Wrap(err, "setup gRPC server")
   261  		}
   262  
   263  		exemplarSrv := exemplars.NewPrometheus(conf.prometheus.url, c, m.Labels)
   264  
   265  		infoSrv := info.NewInfoServer(
   266  			component.Sidecar.String(),
   267  			info.WithLabelSetFunc(func() []labelpb.ZLabelSet {
   268  				return promStore.LabelSet()
   269  			}),
   270  			info.WithStoreInfoFunc(func() *infopb.StoreInfo {
   271  				if httpProbe.IsReady() {
   272  					mint, maxt := promStore.Timestamps()
   273  					return &infopb.StoreInfo{
   274  						MinTime:                      mint,
   275  						MaxTime:                      maxt,
   276  						SupportsSharding:             true,
   277  						SupportsWithoutReplicaLabels: true,
   278  						TsdbInfos:                    promStore.TSDBInfos(),
   279  					}
   280  				}
   281  				return nil
   282  			}),
   283  			info.WithExemplarsInfoFunc(),
   284  			info.WithRulesInfoFunc(),
   285  			info.WithTargetsInfoFunc(),
   286  			info.WithMetricMetadataInfoFunc(),
   287  		)
   288  
   289  		storeServer := store.NewLimitedStoreServer(store.NewInstrumentedStoreServer(reg, promStore), reg, conf.storeRateLimits)
   290  		s := grpcserver.New(logger, reg, tracer, grpcLogOpts, tagOpts, comp, grpcProbe,
   291  			grpcserver.WithServer(store.RegisterStoreServer(storeServer, logger)),
   292  			grpcserver.WithServer(rules.RegisterRulesServer(rules.NewPrometheus(conf.prometheus.url, c, m.Labels))),
   293  			grpcserver.WithServer(targets.RegisterTargetsServer(targets.NewPrometheus(conf.prometheus.url, c, m.Labels))),
   294  			grpcserver.WithServer(meta.RegisterMetadataServer(meta.NewPrometheus(conf.prometheus.url, c))),
   295  			grpcserver.WithServer(exemplars.RegisterExemplarsServer(exemplarSrv)),
   296  			grpcserver.WithServer(info.RegisterInfoServer(infoSrv)),
   297  			grpcserver.WithListen(conf.grpc.bindAddress),
   298  			grpcserver.WithGracePeriod(conf.grpc.gracePeriod),
   299  			grpcserver.WithMaxConnAge(conf.grpc.maxConnectionAge),
   300  			grpcserver.WithTLSConfig(tlsCfg),
   301  		)
   302  		g.Add(func() error {
   303  			statusProber.Ready()
   304  			return s.ListenAndServe()
   305  		}, func(err error) {
   306  			statusProber.NotReady(err)
   307  			s.Shutdown(err)
   308  		})
   309  	}
   310  
   311  	if uploads {
   312  		// The background shipper continuously scans the data directory and uploads
   313  		// new blocks to Google Cloud Storage or an S3-compatible storage service.
   314  		bkt, err := client.NewBucket(logger, confContentYaml, component.Sidecar.String())
   315  		if err != nil {
   316  			return err
   317  		}
   318  		bkt = objstoretracing.WrapWithTraces(objstore.WrapWithMetrics(bkt, extprom.WrapRegistererWithPrefix("thanos_", reg), bkt.Name()))
   319  
   320  		// Ensure we close up everything properly.
   321  		defer func() {
   322  			if err != nil {
   323  				runutil.CloseWithLogOnErr(logger, bkt, "bucket client")
   324  			}
   325  		}()
   326  
   327  		if err := promclient.IsWALDirAccessible(conf.tsdb.path); err != nil {
   328  			level.Error(logger).Log("err", err)
   329  		}
   330  
   331  		ctx, cancel := context.WithCancel(context.Background())
   332  		g.Add(func() error {
   333  			defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client")
   334  
   335  			promReadyTimeout := conf.prometheus.readyTimeout
   336  			extLabelsCtx, cancel := context.WithTimeout(ctx, promReadyTimeout)
   337  			defer cancel()
   338  
   339  			if err := runutil.Retry(2*time.Second, extLabelsCtx.Done(), func() error {
   340  				if len(m.Labels()) == 0 {
   341  					return errors.New("not uploading as no external labels are configured yet - is Prometheus healthy/reachable?")
   342  				}
   343  				return nil
   344  			}); err != nil {
   345  				return errors.Wrapf(err, "aborting as no external labels found after waiting %s", promReadyTimeout)
   346  			}
   347  
   348  			uploadCompactedFunc := func() bool { return conf.shipper.uploadCompacted }
   349  			s := shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource,
   350  				uploadCompactedFunc, conf.shipper.allowOutOfOrderUpload, metadata.HashFunc(conf.shipper.hashFunc))
   351  
   352  			return runutil.Repeat(30*time.Second, ctx.Done(), func() error {
   353  				if uploaded, err := s.Sync(ctx); err != nil {
   354  					level.Warn(logger).Log("err", err, "uploaded", uploaded)
   355  				}
   356  
   357  				minTime, _, err := s.Timestamps()
   358  				if err != nil {
   359  					level.Warn(logger).Log("msg", "reading timestamps failed", "err", err)
   360  					return nil
   361  				}
   362  				m.UpdateTimestamps(minTime, math.MaxInt64)
   363  				return nil
   364  			})
   365  		}, func(error) {
   366  			cancel()
   367  		})
   368  	}
   369  
   370  	level.Info(logger).Log("msg", "starting sidecar")
   371  	return nil
   372  }
   373  
   374  func validatePrometheus(ctx context.Context, client *promclient.Client, logger log.Logger, ignoreBlockSize bool, m *promMetadata) error {
   375  	var (
   376  		flagErr error
   377  		flags   promclient.Flags
   378  	)
   379  
   380  	if err := runutil.Retry(2*time.Second, ctx.Done(), func() error {
   381  		if flags, flagErr = client.ConfiguredFlags(ctx, m.promURL); flagErr != nil && flagErr != promclient.ErrFlagEndpointNotFound {
   382  			level.Warn(logger).Log("msg", "failed to get Prometheus flags. Is Prometheus running? Retrying", "err", flagErr)
   383  			return errors.Wrapf(flagErr, "fetch Prometheus flags")
   384  		}
   385  		return nil
   386  	}); err != nil {
   387  		return errors.Wrapf(err, "fetch Prometheus flags")
   388  	}
   389  
   390  	if flagErr != nil {
   391  		level.Warn(logger).Log("msg", "failed to check Prometheus flags, due to potentially older Prometheus. No extra validation is done.", "err", flagErr)
   392  		return nil
   393  	}
   394  
   395  	// Check if compaction is disabled.
   396  	if flags.TSDBMinTime != flags.TSDBMaxTime {
   397  		if !ignoreBlockSize {
   398  			return errors.Errorf("found that TSDB Max time is %s and Min time is %s. "+
   399  				"Compaction needs to be disabled (storage.tsdb.min-block-duration = storage.tsdb.max-block-duration)", flags.TSDBMaxTime, flags.TSDBMinTime)
   400  		}
   401  		level.Warn(logger).Log("msg", "flag to ignore Prometheus min/max block duration flags differing is being used. If the upload of a 2h block fails and a Prometheus compaction happens that block may be missing from your Thanos bucket storage.")
   402  	}
   403  	// Check if block time is 2h.
   404  	if flags.TSDBMinTime != model.Duration(2*time.Hour) {
   405  		level.Warn(logger).Log("msg", "found that TSDB block time is not 2h. Only 2h block time is recommended.", "block-time", flags.TSDBMinTime)
   406  	}
   407  
   408  	return nil
   409  }
   410  
   411  type promMetadata struct {
   412  	promURL *url.URL
   413  
   414  	mtx          sync.Mutex
   415  	mint         int64
   416  	maxt         int64
   417  	labels       labels.Labels
   418  	promVersion  string
   419  	limitMinTime thanosmodel.TimeOrDurationValue
   420  
   421  	client *promclient.Client
   422  }
   423  
   424  func (s *promMetadata) UpdateLabels(ctx context.Context) error {
   425  	elset, err := s.client.ExternalLabels(ctx, s.promURL)
   426  	if err != nil {
   427  		return err
   428  	}
   429  
   430  	s.mtx.Lock()
   431  	defer s.mtx.Unlock()
   432  
   433  	s.labels = elset
   434  	return nil
   435  }
   436  
   437  func (s *promMetadata) UpdateTimestamps(mint, maxt int64) {
   438  	s.mtx.Lock()
   439  	defer s.mtx.Unlock()
   440  
   441  	if mint < s.limitMinTime.PrometheusTimestamp() {
   442  		mint = s.limitMinTime.PrometheusTimestamp()
   443  	}
   444  
   445  	s.mint = mint
   446  	s.maxt = maxt
   447  }
   448  
   449  func (s *promMetadata) Labels() labels.Labels {
   450  	s.mtx.Lock()
   451  	defer s.mtx.Unlock()
   452  
   453  	return s.labels
   454  }
   455  
   456  func (s *promMetadata) Timestamps() (mint, maxt int64) {
   457  	s.mtx.Lock()
   458  	defer s.mtx.Unlock()
   459  
   460  	return s.mint, s.maxt
   461  }
   462  
   463  func (s *promMetadata) BuildVersion(ctx context.Context) error {
   464  	ver, err := s.client.BuildVersion(ctx, s.promURL)
   465  	if err != nil {
   466  		return err
   467  	}
   468  
   469  	s.mtx.Lock()
   470  	defer s.mtx.Unlock()
   471  
   472  	s.promVersion = ver
   473  	return nil
   474  }
   475  
   476  func (s *promMetadata) Version() string {
   477  	s.mtx.Lock()
   478  	defer s.mtx.Unlock()
   479  
   480  	return s.promVersion
   481  }
   482  
   483  type sidecarConfig struct {
   484  	http            httpConfig
   485  	grpc            grpcConfig
   486  	prometheus      prometheusConfig
   487  	tsdb            tsdbConfig
   488  	reloader        reloaderConfig
   489  	reqLogConfig    *extflag.PathOrContent
   490  	objStore        extflag.PathOrContent
   491  	shipper         shipperConfig
   492  	limitMinTime    thanosmodel.TimeOrDurationValue
   493  	storeRateLimits store.SeriesSelectLimits
   494  }
   495  
   496  func (sc *sidecarConfig) registerFlag(cmd extkingpin.FlagClause) {
   497  	sc.http.registerFlag(cmd)
   498  	sc.grpc.registerFlag(cmd)
   499  	sc.prometheus.registerFlag(cmd)
   500  	sc.tsdb.registerFlag(cmd)
   501  	sc.reloader.registerFlag(cmd)
   502  	sc.reqLogConfig = extkingpin.RegisterRequestLoggingFlags(cmd)
   503  	sc.objStore = *extkingpin.RegisterCommonObjStoreFlags(cmd, "", false)
   504  	sc.shipper.registerFlag(cmd)
   505  	sc.storeRateLimits.RegisterFlags(cmd)
   506  	cmd.Flag("min-time", "Start of time range limit to serve. Thanos sidecar will serve only metrics, which happened later than this value. Option can be a constant time in RFC3339 format or time duration relative to current time, such as -1d or 2h45m. Valid duration units are ms, s, m, h, d, w, y.").
   507  		Default("0000-01-01T00:00:00Z").SetValue(&sc.limitMinTime)
   508  }