github.com/thanos-io/thanos@v0.32.5/cmd/thanos/sidecar.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package main 5 6 import ( 7 "context" 8 "math" 9 "net/url" 10 "sync" 11 "time" 12 13 extflag "github.com/efficientgo/tools/extkingpin" 14 "github.com/go-kit/log" 15 "github.com/go-kit/log/level" 16 grpc_logging "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/logging" 17 "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/tags" 18 "github.com/oklog/run" 19 "github.com/opentracing/opentracing-go" 20 "github.com/pkg/errors" 21 "github.com/prometheus/client_golang/prometheus" 22 "github.com/prometheus/client_golang/prometheus/promauto" 23 "github.com/prometheus/common/model" 24 "github.com/prometheus/prometheus/model/labels" 25 26 "github.com/thanos-io/objstore" 27 "github.com/thanos-io/objstore/client" 28 objstoretracing "github.com/thanos-io/objstore/tracing/opentracing" 29 30 "github.com/thanos-io/thanos/pkg/block/metadata" 31 "github.com/thanos-io/thanos/pkg/component" 32 "github.com/thanos-io/thanos/pkg/exemplars" 33 "github.com/thanos-io/thanos/pkg/extkingpin" 34 "github.com/thanos-io/thanos/pkg/extprom" 35 "github.com/thanos-io/thanos/pkg/httpconfig" 36 "github.com/thanos-io/thanos/pkg/info" 37 "github.com/thanos-io/thanos/pkg/info/infopb" 38 "github.com/thanos-io/thanos/pkg/logging" 39 meta "github.com/thanos-io/thanos/pkg/metadata" 40 thanosmodel "github.com/thanos-io/thanos/pkg/model" 41 "github.com/thanos-io/thanos/pkg/prober" 42 "github.com/thanos-io/thanos/pkg/promclient" 43 "github.com/thanos-io/thanos/pkg/reloader" 44 "github.com/thanos-io/thanos/pkg/rules" 45 "github.com/thanos-io/thanos/pkg/runutil" 46 grpcserver "github.com/thanos-io/thanos/pkg/server/grpc" 47 httpserver "github.com/thanos-io/thanos/pkg/server/http" 48 "github.com/thanos-io/thanos/pkg/shipper" 49 "github.com/thanos-io/thanos/pkg/store" 50 "github.com/thanos-io/thanos/pkg/store/labelpb" 51 "github.com/thanos-io/thanos/pkg/targets" 52 "github.com/thanos-io/thanos/pkg/tls" 53 ) 54 55 func registerSidecar(app *extkingpin.App) { 56 cmd := app.Command(component.Sidecar.String(), "Sidecar for Prometheus server.") 57 conf := &sidecarConfig{} 58 conf.registerFlag(cmd) 59 cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error { 60 tagOpts, grpcLogOpts, err := logging.ParsegRPCOptions("", conf.reqLogConfig) 61 if err != nil { 62 return errors.Wrap(err, "error while parsing config for request logging") 63 } 64 65 rl := reloader.New(log.With(logger, "component", "reloader"), 66 extprom.WrapRegistererWithPrefix("thanos_sidecar_", reg), 67 &reloader.Options{ 68 ReloadURL: reloader.ReloadURLFromBase(conf.prometheus.url), 69 CfgFile: conf.reloader.confFile, 70 CfgOutputFile: conf.reloader.envVarConfFile, 71 WatchedDirs: conf.reloader.ruleDirectories, 72 WatchInterval: conf.reloader.watchInterval, 73 RetryInterval: conf.reloader.retryInterval, 74 }) 75 76 return runSidecar(g, logger, reg, tracer, rl, component.Sidecar, *conf, grpcLogOpts, tagOpts) 77 }) 78 } 79 80 func runSidecar( 81 g *run.Group, 82 logger log.Logger, 83 reg *prometheus.Registry, 84 tracer opentracing.Tracer, 85 reloader *reloader.Reloader, 86 comp component.Component, 87 conf sidecarConfig, 88 grpcLogOpts []grpc_logging.Option, 89 tagOpts []tags.Option, 90 ) error { 91 httpConfContentYaml, err := conf.prometheus.httpClient.Content() 92 if err != nil { 93 return errors.Wrap(err, "getting http client config") 94 } 95 httpClientConfig, err := httpconfig.NewClientConfigFromYAML(httpConfContentYaml) 96 if err != nil { 97 return errors.Wrap(err, "parsing http config YAML") 98 } 99 100 httpClient, err := httpconfig.NewHTTPClient(*httpClientConfig, "thanos-sidecar") 101 if err != nil { 102 return errors.Wrap(err, "Improper http client config") 103 } 104 105 reloader.SetHttpClient(*httpClient) 106 107 var m = &promMetadata{ 108 promURL: conf.prometheus.url, 109 110 // Start out with the full time range. The shipper will constrain it later. 111 // TODO(fabxc): minimum timestamp is never adjusted if shipping is disabled. 112 mint: conf.limitMinTime.PrometheusTimestamp(), 113 maxt: math.MaxInt64, 114 115 limitMinTime: conf.limitMinTime, 116 client: promclient.NewWithTracingClient(logger, httpClient, "thanos-sidecar"), 117 } 118 119 confContentYaml, err := conf.objStore.Content() 120 if err != nil { 121 return errors.Wrap(err, "getting object store config") 122 } 123 124 var uploads = true 125 if len(confContentYaml) == 0 { 126 level.Info(logger).Log("msg", "no supported bucket was configured, uploads will be disabled") 127 uploads = false 128 } 129 130 grpcProbe := prober.NewGRPC() 131 httpProbe := prober.NewHTTP() 132 statusProber := prober.Combine( 133 httpProbe, 134 grpcProbe, 135 prober.NewInstrumentation(comp, logger, extprom.WrapRegistererWithPrefix("thanos_", reg)), 136 ) 137 138 srv := httpserver.New(logger, reg, comp, httpProbe, 139 httpserver.WithListen(conf.http.bindAddress), 140 httpserver.WithGracePeriod(time.Duration(conf.http.gracePeriod)), 141 httpserver.WithTLSConfig(conf.http.tlsConfig), 142 ) 143 144 g.Add(func() error { 145 statusProber.Healthy() 146 147 return srv.ListenAndServe() 148 }, func(err error) { 149 statusProber.NotReady(err) 150 defer statusProber.NotHealthy(err) 151 152 srv.Shutdown(err) 153 }) 154 155 // Setup all the concurrent groups. 156 { 157 promUp := promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 158 Name: "thanos_sidecar_prometheus_up", 159 Help: "Boolean indicator whether the sidecar can reach its Prometheus peer.", 160 }) 161 162 ctx, cancel := context.WithCancel(context.Background()) 163 g.Add(func() error { 164 // Only check Prometheus's flags when upload is enabled. 165 if uploads { 166 // Check prometheus's flags to ensure same sidecar flags. 167 if err := validatePrometheus(ctx, m.client, logger, conf.shipper.ignoreBlockSize, m); err != nil { 168 return errors.Wrap(err, "validate Prometheus flags") 169 } 170 } 171 172 // We retry infinitely until we reach and fetch BuildVersion from our Prometheus. 173 err := runutil.Retry(2*time.Second, ctx.Done(), func() error { 174 if err := m.BuildVersion(ctx); err != nil { 175 level.Warn(logger).Log( 176 "msg", "failed to fetch prometheus version. Is Prometheus running? Retrying", 177 "err", err, 178 ) 179 return err 180 } 181 182 level.Info(logger).Log( 183 "msg", "successfully loaded prometheus version", 184 ) 185 return nil 186 }) 187 if err != nil { 188 return errors.Wrap(err, "failed to get prometheus version") 189 } 190 191 // Blocking query of external labels before joining as a Source Peer into gossip. 192 // We retry infinitely until we reach and fetch labels from our Prometheus. 193 err = runutil.Retry(2*time.Second, ctx.Done(), func() error { 194 if err := m.UpdateLabels(ctx); err != nil { 195 level.Warn(logger).Log( 196 "msg", "failed to fetch initial external labels. Is Prometheus running? Retrying", 197 "err", err, 198 ) 199 promUp.Set(0) 200 statusProber.NotReady(err) 201 return err 202 } 203 204 level.Info(logger).Log( 205 "msg", "successfully loaded prometheus external labels", 206 "external_labels", m.Labels().String(), 207 ) 208 promUp.Set(1) 209 statusProber.Ready() 210 return nil 211 }) 212 if err != nil { 213 return errors.Wrap(err, "initial external labels query") 214 } 215 216 if len(m.Labels()) == 0 { 217 return errors.New("no external labels configured on Prometheus server, uniquely identifying external labels must be configured; see https://thanos.io/tip/thanos/storage.md#external-labels for details.") 218 } 219 220 // Periodically query the Prometheus config. We use this as a heartbeat as well as for updating 221 // the external labels we apply. 222 return runutil.Repeat(conf.prometheus.getConfigInterval, ctx.Done(), func() error { 223 iterCtx, iterCancel := context.WithTimeout(context.Background(), conf.prometheus.getConfigTimeout) 224 defer iterCancel() 225 226 if err := m.UpdateLabels(iterCtx); err != nil { 227 level.Warn(logger).Log("msg", "heartbeat failed", "err", err) 228 promUp.Set(0) 229 statusProber.NotReady(err) 230 } else { 231 promUp.Set(1) 232 statusProber.Ready() 233 } 234 235 return nil 236 }) 237 }, func(error) { 238 cancel() 239 }) 240 } 241 { 242 ctx, cancel := context.WithCancel(context.Background()) 243 g.Add(func() error { 244 return reloader.Watch(ctx) 245 }, func(error) { 246 cancel() 247 }) 248 } 249 { 250 c := promclient.NewWithTracingClient(logger, httpClient, httpconfig.ThanosUserAgent) 251 252 promStore, err := store.NewPrometheusStore(logger, reg, c, conf.prometheus.url, component.Sidecar, m.Labels, m.Timestamps, m.Version) 253 if err != nil { 254 return errors.Wrap(err, "create Prometheus store") 255 } 256 257 tlsCfg, err := tls.NewServerConfig(log.With(logger, "protocol", "gRPC"), 258 conf.grpc.tlsSrvCert, conf.grpc.tlsSrvKey, conf.grpc.tlsSrvClientCA) 259 if err != nil { 260 return errors.Wrap(err, "setup gRPC server") 261 } 262 263 exemplarSrv := exemplars.NewPrometheus(conf.prometheus.url, c, m.Labels) 264 265 infoSrv := info.NewInfoServer( 266 component.Sidecar.String(), 267 info.WithLabelSetFunc(func() []labelpb.ZLabelSet { 268 return promStore.LabelSet() 269 }), 270 info.WithStoreInfoFunc(func() *infopb.StoreInfo { 271 if httpProbe.IsReady() { 272 mint, maxt := promStore.Timestamps() 273 return &infopb.StoreInfo{ 274 MinTime: mint, 275 MaxTime: maxt, 276 SupportsSharding: true, 277 SupportsWithoutReplicaLabels: true, 278 TsdbInfos: promStore.TSDBInfos(), 279 } 280 } 281 return nil 282 }), 283 info.WithExemplarsInfoFunc(), 284 info.WithRulesInfoFunc(), 285 info.WithTargetsInfoFunc(), 286 info.WithMetricMetadataInfoFunc(), 287 ) 288 289 storeServer := store.NewLimitedStoreServer(store.NewInstrumentedStoreServer(reg, promStore), reg, conf.storeRateLimits) 290 s := grpcserver.New(logger, reg, tracer, grpcLogOpts, tagOpts, comp, grpcProbe, 291 grpcserver.WithServer(store.RegisterStoreServer(storeServer, logger)), 292 grpcserver.WithServer(rules.RegisterRulesServer(rules.NewPrometheus(conf.prometheus.url, c, m.Labels))), 293 grpcserver.WithServer(targets.RegisterTargetsServer(targets.NewPrometheus(conf.prometheus.url, c, m.Labels))), 294 grpcserver.WithServer(meta.RegisterMetadataServer(meta.NewPrometheus(conf.prometheus.url, c))), 295 grpcserver.WithServer(exemplars.RegisterExemplarsServer(exemplarSrv)), 296 grpcserver.WithServer(info.RegisterInfoServer(infoSrv)), 297 grpcserver.WithListen(conf.grpc.bindAddress), 298 grpcserver.WithGracePeriod(conf.grpc.gracePeriod), 299 grpcserver.WithMaxConnAge(conf.grpc.maxConnectionAge), 300 grpcserver.WithTLSConfig(tlsCfg), 301 ) 302 g.Add(func() error { 303 statusProber.Ready() 304 return s.ListenAndServe() 305 }, func(err error) { 306 statusProber.NotReady(err) 307 s.Shutdown(err) 308 }) 309 } 310 311 if uploads { 312 // The background shipper continuously scans the data directory and uploads 313 // new blocks to Google Cloud Storage or an S3-compatible storage service. 314 bkt, err := client.NewBucket(logger, confContentYaml, component.Sidecar.String()) 315 if err != nil { 316 return err 317 } 318 bkt = objstoretracing.WrapWithTraces(objstore.WrapWithMetrics(bkt, extprom.WrapRegistererWithPrefix("thanos_", reg), bkt.Name())) 319 320 // Ensure we close up everything properly. 321 defer func() { 322 if err != nil { 323 runutil.CloseWithLogOnErr(logger, bkt, "bucket client") 324 } 325 }() 326 327 if err := promclient.IsWALDirAccessible(conf.tsdb.path); err != nil { 328 level.Error(logger).Log("err", err) 329 } 330 331 ctx, cancel := context.WithCancel(context.Background()) 332 g.Add(func() error { 333 defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client") 334 335 promReadyTimeout := conf.prometheus.readyTimeout 336 extLabelsCtx, cancel := context.WithTimeout(ctx, promReadyTimeout) 337 defer cancel() 338 339 if err := runutil.Retry(2*time.Second, extLabelsCtx.Done(), func() error { 340 if len(m.Labels()) == 0 { 341 return errors.New("not uploading as no external labels are configured yet - is Prometheus healthy/reachable?") 342 } 343 return nil 344 }); err != nil { 345 return errors.Wrapf(err, "aborting as no external labels found after waiting %s", promReadyTimeout) 346 } 347 348 uploadCompactedFunc := func() bool { return conf.shipper.uploadCompacted } 349 s := shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource, 350 uploadCompactedFunc, conf.shipper.allowOutOfOrderUpload, metadata.HashFunc(conf.shipper.hashFunc)) 351 352 return runutil.Repeat(30*time.Second, ctx.Done(), func() error { 353 if uploaded, err := s.Sync(ctx); err != nil { 354 level.Warn(logger).Log("err", err, "uploaded", uploaded) 355 } 356 357 minTime, _, err := s.Timestamps() 358 if err != nil { 359 level.Warn(logger).Log("msg", "reading timestamps failed", "err", err) 360 return nil 361 } 362 m.UpdateTimestamps(minTime, math.MaxInt64) 363 return nil 364 }) 365 }, func(error) { 366 cancel() 367 }) 368 } 369 370 level.Info(logger).Log("msg", "starting sidecar") 371 return nil 372 } 373 374 func validatePrometheus(ctx context.Context, client *promclient.Client, logger log.Logger, ignoreBlockSize bool, m *promMetadata) error { 375 var ( 376 flagErr error 377 flags promclient.Flags 378 ) 379 380 if err := runutil.Retry(2*time.Second, ctx.Done(), func() error { 381 if flags, flagErr = client.ConfiguredFlags(ctx, m.promURL); flagErr != nil && flagErr != promclient.ErrFlagEndpointNotFound { 382 level.Warn(logger).Log("msg", "failed to get Prometheus flags. Is Prometheus running? Retrying", "err", flagErr) 383 return errors.Wrapf(flagErr, "fetch Prometheus flags") 384 } 385 return nil 386 }); err != nil { 387 return errors.Wrapf(err, "fetch Prometheus flags") 388 } 389 390 if flagErr != nil { 391 level.Warn(logger).Log("msg", "failed to check Prometheus flags, due to potentially older Prometheus. No extra validation is done.", "err", flagErr) 392 return nil 393 } 394 395 // Check if compaction is disabled. 396 if flags.TSDBMinTime != flags.TSDBMaxTime { 397 if !ignoreBlockSize { 398 return errors.Errorf("found that TSDB Max time is %s and Min time is %s. "+ 399 "Compaction needs to be disabled (storage.tsdb.min-block-duration = storage.tsdb.max-block-duration)", flags.TSDBMaxTime, flags.TSDBMinTime) 400 } 401 level.Warn(logger).Log("msg", "flag to ignore Prometheus min/max block duration flags differing is being used. If the upload of a 2h block fails and a Prometheus compaction happens that block may be missing from your Thanos bucket storage.") 402 } 403 // Check if block time is 2h. 404 if flags.TSDBMinTime != model.Duration(2*time.Hour) { 405 level.Warn(logger).Log("msg", "found that TSDB block time is not 2h. Only 2h block time is recommended.", "block-time", flags.TSDBMinTime) 406 } 407 408 return nil 409 } 410 411 type promMetadata struct { 412 promURL *url.URL 413 414 mtx sync.Mutex 415 mint int64 416 maxt int64 417 labels labels.Labels 418 promVersion string 419 limitMinTime thanosmodel.TimeOrDurationValue 420 421 client *promclient.Client 422 } 423 424 func (s *promMetadata) UpdateLabels(ctx context.Context) error { 425 elset, err := s.client.ExternalLabels(ctx, s.promURL) 426 if err != nil { 427 return err 428 } 429 430 s.mtx.Lock() 431 defer s.mtx.Unlock() 432 433 s.labels = elset 434 return nil 435 } 436 437 func (s *promMetadata) UpdateTimestamps(mint, maxt int64) { 438 s.mtx.Lock() 439 defer s.mtx.Unlock() 440 441 if mint < s.limitMinTime.PrometheusTimestamp() { 442 mint = s.limitMinTime.PrometheusTimestamp() 443 } 444 445 s.mint = mint 446 s.maxt = maxt 447 } 448 449 func (s *promMetadata) Labels() labels.Labels { 450 s.mtx.Lock() 451 defer s.mtx.Unlock() 452 453 return s.labels 454 } 455 456 func (s *promMetadata) Timestamps() (mint, maxt int64) { 457 s.mtx.Lock() 458 defer s.mtx.Unlock() 459 460 return s.mint, s.maxt 461 } 462 463 func (s *promMetadata) BuildVersion(ctx context.Context) error { 464 ver, err := s.client.BuildVersion(ctx, s.promURL) 465 if err != nil { 466 return err 467 } 468 469 s.mtx.Lock() 470 defer s.mtx.Unlock() 471 472 s.promVersion = ver 473 return nil 474 } 475 476 func (s *promMetadata) Version() string { 477 s.mtx.Lock() 478 defer s.mtx.Unlock() 479 480 return s.promVersion 481 } 482 483 type sidecarConfig struct { 484 http httpConfig 485 grpc grpcConfig 486 prometheus prometheusConfig 487 tsdb tsdbConfig 488 reloader reloaderConfig 489 reqLogConfig *extflag.PathOrContent 490 objStore extflag.PathOrContent 491 shipper shipperConfig 492 limitMinTime thanosmodel.TimeOrDurationValue 493 storeRateLimits store.SeriesSelectLimits 494 } 495 496 func (sc *sidecarConfig) registerFlag(cmd extkingpin.FlagClause) { 497 sc.http.registerFlag(cmd) 498 sc.grpc.registerFlag(cmd) 499 sc.prometheus.registerFlag(cmd) 500 sc.tsdb.registerFlag(cmd) 501 sc.reloader.registerFlag(cmd) 502 sc.reqLogConfig = extkingpin.RegisterRequestLoggingFlags(cmd) 503 sc.objStore = *extkingpin.RegisterCommonObjStoreFlags(cmd, "", false) 504 sc.shipper.registerFlag(cmd) 505 sc.storeRateLimits.RegisterFlags(cmd) 506 cmd.Flag("min-time", "Start of time range limit to serve. Thanos sidecar will serve only metrics, which happened later than this value. Option can be a constant time in RFC3339 format or time duration relative to current time, such as -1d or 2h45m. Valid duration units are ms, s, m, h, d, w, y."). 507 Default("0000-01-01T00:00:00Z").SetValue(&sc.limitMinTime) 508 }