github.com/thanos-io/thanos@v0.32.5/cmd/thanos/query.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package main 5 6 import ( 7 "context" 8 "fmt" 9 "math" 10 "net/http" 11 "strings" 12 "time" 13 14 "google.golang.org/grpc" 15 16 "github.com/go-kit/log" 17 "github.com/go-kit/log/level" 18 grpc_logging "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/logging" 19 "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/tags" 20 "github.com/oklog/run" 21 "github.com/opentracing/opentracing-go" 22 "github.com/pkg/errors" 23 "github.com/prometheus/client_golang/prometheus" 24 "github.com/prometheus/client_golang/prometheus/promauto" 25 "github.com/prometheus/common/route" 26 "github.com/prometheus/prometheus/discovery/file" 27 "github.com/prometheus/prometheus/discovery/targetgroup" 28 "github.com/prometheus/prometheus/model/labels" 29 "github.com/prometheus/prometheus/promql" 30 "github.com/thanos-io/promql-engine/api" 31 32 apiv1 "github.com/thanos-io/thanos/pkg/api/query" 33 "github.com/thanos-io/thanos/pkg/api/query/querypb" 34 "github.com/thanos-io/thanos/pkg/compact/downsample" 35 "github.com/thanos-io/thanos/pkg/component" 36 "github.com/thanos-io/thanos/pkg/discovery/cache" 37 "github.com/thanos-io/thanos/pkg/discovery/dns" 38 "github.com/thanos-io/thanos/pkg/exemplars" 39 "github.com/thanos-io/thanos/pkg/extgrpc" 40 "github.com/thanos-io/thanos/pkg/extgrpc/snappy" 41 "github.com/thanos-io/thanos/pkg/extkingpin" 42 "github.com/thanos-io/thanos/pkg/extprom" 43 extpromhttp "github.com/thanos-io/thanos/pkg/extprom/http" 44 "github.com/thanos-io/thanos/pkg/gate" 45 "github.com/thanos-io/thanos/pkg/info" 46 "github.com/thanos-io/thanos/pkg/info/infopb" 47 "github.com/thanos-io/thanos/pkg/logging" 48 "github.com/thanos-io/thanos/pkg/metadata" 49 "github.com/thanos-io/thanos/pkg/prober" 50 "github.com/thanos-io/thanos/pkg/query" 51 "github.com/thanos-io/thanos/pkg/rules" 52 
"github.com/thanos-io/thanos/pkg/runutil" 53 grpcserver "github.com/thanos-io/thanos/pkg/server/grpc" 54 httpserver "github.com/thanos-io/thanos/pkg/server/http" 55 "github.com/thanos-io/thanos/pkg/store" 56 "github.com/thanos-io/thanos/pkg/store/labelpb" 57 "github.com/thanos-io/thanos/pkg/targets" 58 "github.com/thanos-io/thanos/pkg/tenancy" 59 "github.com/thanos-io/thanos/pkg/tls" 60 "github.com/thanos-io/thanos/pkg/ui" 61 ) 62 63 const ( 64 promqlNegativeOffset = "promql-negative-offset" 65 promqlAtModifier = "promql-at-modifier" 66 queryPushdown = "query-pushdown" 67 ) 68 69 type queryMode string 70 71 const ( 72 queryModeLocal queryMode = "local" 73 queryModeDistributed queryMode = "distributed" 74 ) 75 76 // registerQuery registers a query command. 77 func registerQuery(app *extkingpin.App) { 78 comp := component.Query 79 cmd := app.Command(comp.String(), "Query node exposing PromQL enabled Query API with data retrieved from multiple store nodes.") 80 81 httpBindAddr, httpGracePeriod, httpTLSConfig := extkingpin.RegisterHTTPFlags(cmd) 82 83 var grpcServerConfig grpcConfig 84 grpcServerConfig.registerFlag(cmd) 85 86 secure := cmd.Flag("grpc-client-tls-secure", "Use TLS when talking to the gRPC server").Default("false").Bool() 87 skipVerify := cmd.Flag("grpc-client-tls-skip-verify", "Disable TLS certificate verification i.e self signed, signed by fake CA").Default("false").Bool() 88 cert := cmd.Flag("grpc-client-tls-cert", "TLS Certificates to use to identify this client to the server").Default("").String() 89 key := cmd.Flag("grpc-client-tls-key", "TLS Key for the client's certificate").Default("").String() 90 caCert := cmd.Flag("grpc-client-tls-ca", "TLS CA Certificates to use to verify gRPC servers").Default("").String() 91 serverName := cmd.Flag("grpc-client-server-name", "Server name to verify the hostname on the returned gRPC certificates. 
See https://tools.ietf.org/html/rfc4366#section-3.1").Default("").String() 92 compressionOptions := strings.Join([]string{snappy.Name, compressionNone}, ", ") 93 grpcCompression := cmd.Flag("grpc-compression", "Compression algorithm to use for gRPC requests to other clients. Must be one of: "+compressionOptions).Default(compressionNone).Enum(snappy.Name, compressionNone) 94 95 webRoutePrefix := cmd.Flag("web.route-prefix", "Prefix for API and UI endpoints. This allows thanos UI to be served on a sub-path. Defaults to the value of --web.external-prefix. This option is analogous to --web.route-prefix of Prometheus.").Default("").String() 96 webExternalPrefix := cmd.Flag("web.external-prefix", "Static prefix for all HTML links and redirect URLs in the UI query web interface. Actual endpoints are still served on / or the web.route-prefix. This allows thanos UI to be served behind a reverse proxy that strips a URL sub-path.").Default("").String() 97 webPrefixHeaderName := cmd.Flag("web.prefix-header", "Name of HTTP request header used for dynamic prefixing of UI links and redirects. This option is ignored if web.external-prefix argument is set. Security risk: enable this option only if a reverse proxy in front of thanos is resetting the header. The --web.prefix-header=X-Forwarded-Prefix option can be useful, for example, if Thanos UI is served via Traefik reverse proxy with PathPrefixStrip option enabled, which sends the stripped prefix value in X-Forwarded-Prefix header. This allows thanos UI to be served on a sub-path.").Default("").String() 98 webDisableCORS := cmd.Flag("web.disable-cors", "Whether to disable CORS headers to be set by Thanos. By default Thanos sets CORS headers to be allowed by all.").Default("false").Bool() 99 100 reqLogDecision := cmd.Flag("log.request.decision", "Deprecation Warning - This flag would be soon deprecated, and replaced with `request.logging-config`. Request Logging for logging the start and end of requests. 
By default this flag is disabled. LogFinishCall: Logs the finish call of the requests. LogStartAndFinishCall: Logs the start and finish call of the requests. NoLogCall: Disable request logging.").Default("").Enum("NoLogCall", "LogFinishCall", "LogStartAndFinishCall", "") 101 102 queryTimeout := extkingpin.ModelDuration(cmd.Flag("query.timeout", "Maximum time to process query by query node."). 103 Default("2m")) 104 105 defaultEngine := cmd.Flag("query.promql-engine", "Default PromQL engine to use.").Default(string(apiv1.PromqlEnginePrometheus)). 106 Enum(string(apiv1.PromqlEnginePrometheus), string(apiv1.PromqlEngineThanos)) 107 108 promqlQueryMode := cmd.Flag("query.mode", "PromQL query mode. One of: local, distributed."). 109 Hidden(). 110 Default(string(queryModeLocal)). 111 Enum(string(queryModeLocal), string(queryModeDistributed)) 112 113 maxConcurrentQueries := cmd.Flag("query.max-concurrent", "Maximum number of queries processed concurrently by query node."). 114 Default("20").Int() 115 116 lookbackDelta := cmd.Flag("query.lookback-delta", "The maximum lookback duration for retrieving metrics during expression evaluations. PromQL always evaluates the query for the certain timestamp (query range timestamps are deduced by step). Since scrape intervals might be different, PromQL looks back for given amount of time to get latest sample. If it exceeds the maximum lookback delta it assumes series is stale and returns none (a gap). This is why lookback delta should be set to at least 2 times of the slowest scrape interval. If unset it will use the promql default of 5m.").Duration() 117 dynamicLookbackDelta := cmd.Flag("query.dynamic-lookback-delta", "Allow for larger lookback duration for queries based on resolution.").Hidden().Default("true").Bool() 118 119 maxConcurrentSelects := cmd.Flag("query.max-concurrent-select", "Maximum number of select requests made concurrently per a query."). 
120 Default("4").Int() 121 122 queryConnMetricLabels := cmd.Flag("query.conn-metric.label", "Optional selection of query connection metric labels to be collected from endpoint set"). 123 Default(string(query.ExternalLabels), string(query.StoreType)). 124 Enums(string(query.ExternalLabels), string(query.StoreType)) 125 126 queryReplicaLabels := cmd.Flag("query.replica-label", "Labels to treat as a replica indicator along which data is deduplicated. Still you will be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules."). 127 Strings() 128 129 instantDefaultMaxSourceResolution := extkingpin.ModelDuration(cmd.Flag("query.instant.default.max_source_resolution", "default value for max_source_resolution for instant queries. If not set, defaults to 0s only taking raw resolution into account. 1h can be a good value if you use instant queries over time ranges that incorporate times outside of your raw-retention.").Default("0s").Hidden()) 130 131 defaultMetadataTimeRange := cmd.Flag("query.metadata.default-time-range", "The default metadata time range duration for retrieving labels through Labels and Series API when the range parameters are not specified. The zero value means range covers the time since the beginning.").Default("0s").Duration() 132 133 selectorLabels := cmd.Flag("selector-label", "Query selector labels that will be exposed in info endpoint (repeated)."). 134 PlaceHolder("<name>=\"<value>\"").Strings() 135 136 endpoints := extkingpin.Addrs(cmd.Flag("endpoint", "Addresses of statically configured Thanos API servers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect Thanos API servers through respective DNS lookups."). 137 PlaceHolder("<endpoint>")) 138 139 endpointGroups := extkingpin.Addrs(cmd.Flag("endpoint-group", "Experimental: DNS name of statically configured Thanos API server groups (repeatable). 
Targets resolved from the DNS name will be queried in a round-robin, instead of a fanout manner. This flag should be used when connecting a Thanos Query to HA groups of Thanos components."). 140 PlaceHolder("<endpoint-group>")) 141 142 stores := extkingpin.Addrs(cmd.Flag("store", "Deprecation Warning - This flag is deprecated and replaced with `endpoint`. Addresses of statically configured store API servers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect store API servers through respective DNS lookups."). 143 PlaceHolder("<store>")) 144 145 // TODO(bwplotka): Hidden because we plan to extract discovery to separate API: https://github.com/thanos-io/thanos/issues/2600. 146 ruleEndpoints := extkingpin.Addrs(cmd.Flag("rule", "Deprecation Warning - This flag is deprecated and replaced with `endpoint`. Experimental: Addresses of statically configured rules API servers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect rule API servers through respective DNS lookups."). 147 Hidden().PlaceHolder("<rule>")) 148 149 metadataEndpoints := extkingpin.Addrs(cmd.Flag("metadata", "Deprecation Warning - This flag is deprecated and replaced with `endpoint`. Experimental: Addresses of statically configured metadata API servers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect metadata API servers through respective DNS lookups."). 150 Hidden().PlaceHolder("<metadata>")) 151 152 exemplarEndpoints := extkingpin.Addrs(cmd.Flag("exemplar", "Deprecation Warning - This flag is deprecated and replaced with `endpoint`. Experimental: Addresses of statically configured exemplars API servers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect exemplars API servers through respective DNS lookups."). 153 Hidden().PlaceHolder("<exemplar>")) 154 155 // TODO(atunik): Hidden because we plan to extract discovery to separate API: https://github.com/thanos-io/thanos/issues/2600. 
156 targetEndpoints := extkingpin.Addrs(cmd.Flag("target", "Deprecation Warning - This flag is deprecated and replaced with `endpoint`. Experimental: Addresses of statically configured target API servers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect target API servers through respective DNS lookups."). 157 Hidden().PlaceHolder("<target>")) 158 159 strictStores := cmd.Flag("store-strict", "Deprecation Warning - This flag is deprecated and replaced with `endpoint-strict`. Addresses of only statically configured store API servers that are always used, even if the health check fails. Useful if you have a caching layer on top."). 160 PlaceHolder("<staticstore>").Strings() 161 162 strictEndpoints := cmd.Flag("endpoint-strict", "Addresses of only statically configured Thanos API servers that are always used, even if the health check fails. Useful if you have a caching layer on top."). 163 PlaceHolder("<staticendpoint>").Strings() 164 165 strictEndpointGroups := extkingpin.Addrs(cmd.Flag("endpoint-group-strict", "Experimental: DNS name of statically configured Thanos API server groups (repeatable) that are always used, even if the health check fails."). 166 PlaceHolder("<endpoint-group-strict>")) 167 168 fileSDFiles := cmd.Flag("store.sd-files", "Path to files that contain addresses of store API servers. The path can be a glob pattern (repeatable)."). 169 PlaceHolder("<path>").Strings() 170 171 fileSDInterval := extkingpin.ModelDuration(cmd.Flag("store.sd-interval", "Refresh interval to re-read file SD files. It is used as a resync fallback."). 172 Default("5m")) 173 174 // TODO(bwplotka): Grab this from TTL at some point. 175 dnsSDInterval := extkingpin.ModelDuration(cmd.Flag("store.sd-dns-interval", "Interval between DNS resolutions."). 176 Default("30s")) 177 178 dnsSDResolver := cmd.Flag("store.sd-dns-resolver", fmt.Sprintf("Resolver to use. Possible options: [%s, %s]", dns.GolangResolverType, dns.MiekgdnsResolverType)). 
179 Default(string(dns.MiekgdnsResolverType)).Hidden().String() 180 181 unhealthyStoreTimeout := extkingpin.ModelDuration(cmd.Flag("store.unhealthy-timeout", "Timeout before an unhealthy store is cleaned from the store UI page.").Default("5m")) 182 183 endpointInfoTimeout := extkingpin.ModelDuration(cmd.Flag("endpoint.info-timeout", "Timeout of gRPC Info requests.").Default("5s").Hidden()) 184 185 enableAutodownsampling := cmd.Flag("query.auto-downsampling", "Enable automatic adjustment (step / 5) to what source of data should be used in store gateways if no max_source_resolution param is specified."). 186 Default("false").Bool() 187 188 enableQueryPartialResponse := cmd.Flag("query.partial-response", "Enable partial response for queries if no partial_response param is specified. --no-query.partial-response for disabling."). 189 Default("true").Bool() 190 191 enableRulePartialResponse := cmd.Flag("rule.partial-response", "Enable partial response for rules endpoint. --no-rule.partial-response for disabling."). 192 Hidden().Default("true").Bool() 193 194 enableTargetPartialResponse := cmd.Flag("target.partial-response", "Enable partial response for targets endpoint. --no-target.partial-response for disabling."). 195 Hidden().Default("true").Bool() 196 197 enableMetricMetadataPartialResponse := cmd.Flag("metric-metadata.partial-response", "Enable partial response for metric metadata endpoint. --no-metric-metadata.partial-response for disabling."). 198 Hidden().Default("true").Bool() 199 200 activeQueryDir := cmd.Flag("query.active-query-path", "Directory to log currently active queries in the queries.active file.").Default("").String() 201 202 featureList := cmd.Flag("enable-feature", "Comma separated experimental feature names to enable.The current list of features is "+queryPushdown+".").Default("").Strings() 203 204 enableExemplarPartialResponse := cmd.Flag("exemplar.partial-response", "Enable partial response for exemplar endpoint. 
--no-exemplar.partial-response for disabling."). 205 Hidden().Default("true").Bool() 206 207 defaultEvaluationInterval := extkingpin.ModelDuration(cmd.Flag("query.default-evaluation-interval", "Set default evaluation interval for sub queries.").Default("1m")) 208 209 defaultRangeQueryStep := extkingpin.ModelDuration(cmd.Flag("query.default-step", "Set default step for range queries. Default step is only used when step is not set in UI. In such cases, Thanos UI will use default step to calculate resolution (resolution = max(rangeSeconds / 250, defaultStep)). This will not work from Grafana, but Grafana has __step variable which can be used."). 210 Default("1s")) 211 212 storeResponseTimeout := extkingpin.ModelDuration(cmd.Flag("store.response-timeout", "If a Store doesn't send any data in this specified duration then a Store will be ignored and partial data will be returned if it's enabled. 0 disables timeout.").Default("0ms")) 213 reqLogConfig := extkingpin.RegisterRequestLoggingFlags(cmd) 214 215 alertQueryURL := cmd.Flag("alert.query-url", "The external Thanos Query URL that would be set in all alerts 'Source' field.").String() 216 grpcProxyStrategy := cmd.Flag("grpc.proxy-strategy", "Strategy to use when proxying Series requests to leaf nodes. 
Hidden and only used for testing, will be removed after lazy becomes the default.").Default(string(store.EagerRetrieval)).Hidden().Enum(string(store.EagerRetrieval), string(store.LazyRetrieval)) 217 218 queryTelemetryDurationQuantiles := cmd.Flag("query.telemetry.request-duration-seconds-quantiles", "The quantiles for exporting metrics about the request duration quantiles.").Default("0.1", "0.25", "0.75", "1.25", "1.75", "2.5", "3", "5", "10").Float64List() 219 queryTelemetrySamplesQuantiles := cmd.Flag("query.telemetry.request-samples-quantiles", "The quantiles for exporting metrics about the samples count quantiles.").Default("100", "1000", "10000", "100000", "1000000").Float64List() 220 queryTelemetrySeriesQuantiles := cmd.Flag("query.telemetry.request-series-seconds-quantiles", "The quantiles for exporting metrics about the series count quantiles.").Default("10", "100", "1000", "10000", "100000").Float64List() 221 222 tenantHeader := cmd.Flag("query.tenant-header", "HTTP header to determine tenant.").Default(tenancy.DefaultTenantHeader).Hidden().String() 223 defaultTenant := cmd.Flag("query.default-tenant", "Name of the default tenant.").Default(tenancy.DefaultTenant).Hidden().String() 224 tenantCertField := cmd.Flag("query.tenant-certificate-field", "Use TLS client's certificate field to determine tenant for write requests. Must be one of "+tenancy.CertificateFieldOrganization+", "+tenancy.CertificateFieldOrganizationalUnit+" or "+tenancy.CertificateFieldCommonName+". 
This setting will cause the query.tenant-header flag value to be ignored.").Default("").Hidden().Enum("", tenancy.CertificateFieldOrganization, tenancy.CertificateFieldOrganizationalUnit, tenancy.CertificateFieldCommonName) 225 226 var storeRateLimits store.SeriesSelectLimits 227 storeRateLimits.RegisterFlags(cmd) 228 229 cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, debugLogging bool) error { 230 selectorLset, err := parseFlagLabels(*selectorLabels) 231 if err != nil { 232 return errors.Wrap(err, "parse federation labels") 233 } 234 235 var enableQueryPushdown bool 236 for _, feature := range *featureList { 237 if feature == queryPushdown { 238 enableQueryPushdown = true 239 } 240 if feature == promqlAtModifier { 241 level.Warn(logger).Log("msg", "This option for --enable-feature is now permanently enabled and therefore a no-op.", "option", promqlAtModifier) 242 } 243 if feature == promqlNegativeOffset { 244 level.Warn(logger).Log("msg", "This option for --enable-feature is now permanently enabled and therefore a no-op.", "option", promqlNegativeOffset) 245 } 246 } 247 248 httpLogOpts, err := logging.ParseHTTPOptions(*reqLogDecision, reqLogConfig) 249 if err != nil { 250 return errors.Wrap(err, "error while parsing config for request logging") 251 } 252 253 tagOpts, grpcLogOpts, err := logging.ParsegRPCOptions(*reqLogDecision, reqLogConfig) 254 if err != nil { 255 return errors.Wrap(err, "error while parsing config for request logging") 256 } 257 258 var fileSD *file.Discovery 259 if len(*fileSDFiles) > 0 { 260 conf := &file.SDConfig{ 261 Files: *fileSDFiles, 262 RefreshInterval: *fileSDInterval, 263 } 264 fileSD = file.NewDiscovery(conf, logger) 265 } 266 267 if *webRoutePrefix == "" { 268 *webRoutePrefix = *webExternalPrefix 269 } 270 271 if *webRoutePrefix != *webExternalPrefix { 272 level.Warn(logger).Log("msg", "different values for --web.route-prefix and --web.external-prefix detected, 
web UI may not work without a reverse-proxy.") 273 } 274 275 return runQuery( 276 g, 277 logger, 278 debugLogging, 279 reg, 280 tracer, 281 httpLogOpts, 282 grpcLogOpts, 283 tagOpts, 284 grpcServerConfig, 285 *grpcCompression, 286 *secure, 287 *skipVerify, 288 *cert, 289 *key, 290 *caCert, 291 *serverName, 292 *httpBindAddr, 293 *httpTLSConfig, 294 time.Duration(*httpGracePeriod), 295 *webRoutePrefix, 296 *webExternalPrefix, 297 *webPrefixHeaderName, 298 *maxConcurrentQueries, 299 *maxConcurrentSelects, 300 time.Duration(*defaultRangeQueryStep), 301 time.Duration(*queryTimeout), 302 *lookbackDelta, 303 *dynamicLookbackDelta, 304 time.Duration(*defaultEvaluationInterval), 305 time.Duration(*storeResponseTimeout), 306 *queryConnMetricLabels, 307 *queryReplicaLabels, 308 selectorLset, 309 getFlagsMap(cmd.Flags()), 310 *endpoints, 311 *endpointGroups, 312 *stores, 313 *ruleEndpoints, 314 *targetEndpoints, 315 *metadataEndpoints, 316 *exemplarEndpoints, 317 *enableAutodownsampling, 318 *enableQueryPartialResponse, 319 *enableRulePartialResponse, 320 *enableTargetPartialResponse, 321 *enableMetricMetadataPartialResponse, 322 *enableExemplarPartialResponse, 323 *activeQueryDir, 324 fileSD, 325 time.Duration(*dnsSDInterval), 326 *dnsSDResolver, 327 time.Duration(*unhealthyStoreTimeout), 328 time.Duration(*endpointInfoTimeout), 329 time.Duration(*instantDefaultMaxSourceResolution), 330 *defaultMetadataTimeRange, 331 *strictStores, 332 *strictEndpoints, 333 *strictEndpointGroups, 334 *webDisableCORS, 335 enableQueryPushdown, 336 *alertQueryURL, 337 *grpcProxyStrategy, 338 component.Query, 339 *queryTelemetryDurationQuantiles, 340 *queryTelemetrySamplesQuantiles, 341 *queryTelemetrySeriesQuantiles, 342 *defaultEngine, 343 storeRateLimits, 344 queryMode(*promqlQueryMode), 345 *tenantHeader, 346 *defaultTenant, 347 *tenantCertField, 348 ) 349 }) 350 } 351 352 // runQuery starts a server that exposes PromQL Query API. 
It is responsible for querying configured 353 // store nodes, merging and duplicating the data to satisfy user query. 354 func runQuery( 355 g *run.Group, 356 logger log.Logger, 357 debugLogging bool, 358 reg *prometheus.Registry, 359 tracer opentracing.Tracer, 360 httpLogOpts []logging.Option, 361 grpcLogOpts []grpc_logging.Option, 362 tagOpts []tags.Option, 363 grpcServerConfig grpcConfig, 364 grpcCompression string, 365 secure bool, 366 skipVerify bool, 367 cert string, 368 key string, 369 caCert string, 370 serverName string, 371 httpBindAddr string, 372 httpTLSConfig string, 373 httpGracePeriod time.Duration, 374 webRoutePrefix string, 375 webExternalPrefix string, 376 webPrefixHeaderName string, 377 maxConcurrentQueries int, 378 maxConcurrentSelects int, 379 defaultRangeQueryStep time.Duration, 380 queryTimeout time.Duration, 381 lookbackDelta time.Duration, 382 dynamicLookbackDelta bool, 383 defaultEvaluationInterval time.Duration, 384 storeResponseTimeout time.Duration, 385 queryConnMetricLabels []string, 386 queryReplicaLabels []string, 387 selectorLset labels.Labels, 388 flagsMap map[string]string, 389 endpointAddrs []string, 390 endpointGroupAddrs []string, 391 storeAddrs []string, 392 ruleAddrs []string, 393 targetAddrs []string, 394 metadataAddrs []string, 395 exemplarAddrs []string, 396 enableAutodownsampling bool, 397 enableQueryPartialResponse bool, 398 enableRulePartialResponse bool, 399 enableTargetPartialResponse bool, 400 enableMetricMetadataPartialResponse bool, 401 enableExemplarPartialResponse bool, 402 activeQueryDir string, 403 fileSD *file.Discovery, 404 dnsSDInterval time.Duration, 405 dnsSDResolver string, 406 unhealthyStoreTimeout time.Duration, 407 endpointInfoTimeout time.Duration, 408 instantDefaultMaxSourceResolution time.Duration, 409 defaultMetadataTimeRange time.Duration, 410 strictStores []string, 411 strictEndpoints []string, 412 strictEndpointGroups []string, 413 disableCORS bool, 414 enableQueryPushdown bool, 415 
alertQueryURL string, 416 grpcProxyStrategy string, 417 comp component.Component, 418 queryTelemetryDurationQuantiles []float64, 419 queryTelemetrySamplesQuantiles []float64, 420 queryTelemetrySeriesQuantiles []float64, 421 defaultEngine string, 422 storeRateLimits store.SeriesSelectLimits, 423 queryMode queryMode, 424 tenantHeader string, 425 defaultTenant string, 426 tenantCertField string, 427 ) error { 428 if alertQueryURL == "" { 429 lastColon := strings.LastIndex(httpBindAddr, ":") 430 if lastColon != -1 { 431 alertQueryURL = fmt.Sprintf("http://localhost:%s", httpBindAddr[lastColon+1:]) 432 } 433 // NOTE(GiedriusS): default is set in config.ts. 434 } 435 // TODO(bplotka in PR #513 review): Move arguments into struct. 436 duplicatedStores := promauto.With(reg).NewCounter(prometheus.CounterOpts{ 437 Name: "thanos_query_duplicated_store_addresses_total", 438 Help: "The number of times a duplicated store addresses is detected from the different configs in query", 439 }) 440 441 dialOpts, err := extgrpc.StoreClientGRPCOpts(logger, reg, tracer, secure, skipVerify, cert, key, caCert, serverName) 442 if err != nil { 443 return errors.Wrap(err, "building gRPC client") 444 } 445 if grpcCompression != compressionNone { 446 dialOpts = append(dialOpts, grpc.WithDefaultCallOptions(grpc.UseCompressor(grpcCompression))) 447 } 448 449 fileSDCache := cache.New() 450 dnsStoreProvider := dns.NewProvider( 451 logger, 452 extprom.WrapRegistererWithPrefix("thanos_query_store_apis_", reg), 453 dns.ResolverType(dnsSDResolver), 454 ) 455 456 for _, store := range strictStores { 457 if dns.IsDynamicNode(store) { 458 return errors.Errorf("%s is a dynamically specified store i.e. it uses SD and that is not permitted under strict mode. Use --store for this", store) 459 } 460 } 461 462 for _, endpoint := range strictEndpoints { 463 if dns.IsDynamicNode(endpoint) { 464 return errors.Errorf("%s is a dynamically specified endpoint i.e. it uses SD and that is not permitted under strict mode. 
Use --endpoint for this", endpoint) 465 } 466 } 467 468 dnsEndpointProvider := dns.NewProvider( 469 logger, 470 extprom.WrapRegistererWithPrefix("thanos_query_endpoints_", reg), 471 dns.ResolverType(dnsSDResolver), 472 ) 473 474 dnsRuleProvider := dns.NewProvider( 475 logger, 476 extprom.WrapRegistererWithPrefix("thanos_query_rule_apis_", reg), 477 dns.ResolverType(dnsSDResolver), 478 ) 479 480 dnsTargetProvider := dns.NewProvider( 481 logger, 482 extprom.WrapRegistererWithPrefix("thanos_query_target_apis_", reg), 483 dns.ResolverType(dnsSDResolver), 484 ) 485 486 dnsMetadataProvider := dns.NewProvider( 487 logger, 488 extprom.WrapRegistererWithPrefix("thanos_query_metadata_apis_", reg), 489 dns.ResolverType(dnsSDResolver), 490 ) 491 492 dnsExemplarProvider := dns.NewProvider( 493 logger, 494 extprom.WrapRegistererWithPrefix("thanos_query_exemplar_apis_", reg), 495 dns.ResolverType(dnsSDResolver), 496 ) 497 498 options := []store.ProxyStoreOption{} 499 if debugLogging { 500 options = append(options, store.WithProxyStoreDebugLogging()) 501 } 502 503 var ( 504 endpoints = query.NewEndpointSet( 505 time.Now, 506 logger, 507 reg, 508 func() (specs []*query.GRPCEndpointSpec) { 509 // Add strict & static nodes. 510 for _, addr := range strictStores { 511 specs = append(specs, query.NewGRPCEndpointSpec(addr, true)) 512 } 513 514 for _, addr := range strictEndpoints { 515 specs = append(specs, query.NewGRPCEndpointSpec(addr, true)) 516 } 517 518 for _, dnsProvider := range []*dns.Provider{ 519 dnsStoreProvider, 520 dnsRuleProvider, 521 dnsExemplarProvider, 522 dnsMetadataProvider, 523 dnsTargetProvider, 524 dnsEndpointProvider, 525 } { 526 var tmpSpecs []*query.GRPCEndpointSpec 527 528 for _, addr := range dnsProvider.Addresses() { 529 tmpSpecs = append(tmpSpecs, query.NewGRPCEndpointSpec(addr, false)) 530 } 531 tmpSpecs = removeDuplicateEndpointSpecs(logger, duplicatedStores, tmpSpecs) 532 specs = append(specs, tmpSpecs...) 
533 } 534 535 for _, eg := range endpointGroupAddrs { 536 addr := fmt.Sprintf("dns:///%s", eg) 537 spec := query.NewGRPCEndpointSpec(addr, false, extgrpc.EndpointGroupGRPCOpts()...) 538 specs = append(specs, spec) 539 } 540 541 for _, eg := range strictEndpointGroups { 542 addr := fmt.Sprintf("dns:///%s", eg) 543 spec := query.NewGRPCEndpointSpec(addr, true, extgrpc.EndpointGroupGRPCOpts()...) 544 specs = append(specs, spec) 545 } 546 547 return specs 548 }, 549 dialOpts, 550 unhealthyStoreTimeout, 551 endpointInfoTimeout, 552 queryConnMetricLabels..., 553 ) 554 proxy = store.NewProxyStore(logger, reg, endpoints.GetStoreClients, component.Query, selectorLset, storeResponseTimeout, store.RetrievalStrategy(grpcProxyStrategy), options...) 555 rulesProxy = rules.NewProxy(logger, endpoints.GetRulesClients) 556 targetsProxy = targets.NewProxy(logger, endpoints.GetTargetsClients) 557 metadataProxy = metadata.NewProxy(logger, endpoints.GetMetricMetadataClients) 558 exemplarsProxy = exemplars.NewProxy(logger, endpoints.GetExemplarsStores, selectorLset) 559 queryableCreator = query.NewQueryableCreator( 560 logger, 561 extprom.WrapRegistererWithPrefix("thanos_query_", reg), 562 proxy, 563 maxConcurrentSelects, 564 queryTimeout, 565 ) 566 ) 567 568 // Periodically update the store set with the addresses we see in our cluster. 569 { 570 ctx, cancel := context.WithCancel(context.Background()) 571 g.Add(func() error { 572 return runutil.Repeat(5*time.Second, ctx.Done(), func() error { 573 endpoints.Update(ctx) 574 return nil 575 }) 576 }, func(error) { 577 cancel() 578 endpoints.Close() 579 }) 580 } 581 582 // Run File Service Discovery and update the store set when the files are modified. 
583 if fileSD != nil { 584 var fileSDUpdates chan []*targetgroup.Group 585 ctxRun, cancelRun := context.WithCancel(context.Background()) 586 587 fileSDUpdates = make(chan []*targetgroup.Group) 588 589 g.Add(func() error { 590 fileSD.Run(ctxRun, fileSDUpdates) 591 return nil 592 }, func(error) { 593 cancelRun() 594 }) 595 596 ctxUpdate, cancelUpdate := context.WithCancel(context.Background()) 597 g.Add(func() error { 598 for { 599 select { 600 case update := <-fileSDUpdates: 601 // Discoverers sometimes send nil updates so need to check for it to avoid panics. 602 if update == nil { 603 continue 604 } 605 fileSDCache.Update(update) 606 endpoints.Update(ctxUpdate) 607 608 if err := dnsStoreProvider.Resolve(ctxUpdate, append(fileSDCache.Addresses(), storeAddrs...)); err != nil { 609 level.Error(logger).Log("msg", "failed to resolve addresses for storeAPIs", "err", err) 610 } 611 612 // Rules apis do not support file service discovery as of now. 613 case <-ctxUpdate.Done(): 614 return nil 615 } 616 } 617 }, func(error) { 618 cancelUpdate() 619 }) 620 } 621 // Periodically update the addresses from static flags and file SD by resolving them using DNS SD if necessary. 
622 { 623 ctx, cancel := context.WithCancel(context.Background()) 624 g.Add(func() error { 625 return runutil.Repeat(dnsSDInterval, ctx.Done(), func() error { 626 resolveCtx, resolveCancel := context.WithTimeout(ctx, dnsSDInterval) 627 defer resolveCancel() 628 if err := dnsStoreProvider.Resolve(resolveCtx, append(fileSDCache.Addresses(), storeAddrs...)); err != nil { 629 level.Error(logger).Log("msg", "failed to resolve addresses for storeAPIs", "err", err) 630 } 631 if err := dnsRuleProvider.Resolve(resolveCtx, ruleAddrs); err != nil { 632 level.Error(logger).Log("msg", "failed to resolve addresses for rulesAPIs", "err", err) 633 } 634 if err := dnsTargetProvider.Resolve(ctx, targetAddrs); err != nil { 635 level.Error(logger).Log("msg", "failed to resolve addresses for targetsAPIs", "err", err) 636 } 637 if err := dnsMetadataProvider.Resolve(resolveCtx, metadataAddrs); err != nil { 638 level.Error(logger).Log("msg", "failed to resolve addresses for metadataAPIs", "err", err) 639 } 640 if err := dnsExemplarProvider.Resolve(resolveCtx, exemplarAddrs); err != nil { 641 level.Error(logger).Log("msg", "failed to resolve addresses for exemplarsAPI", "err", err) 642 } 643 if err := dnsEndpointProvider.Resolve(resolveCtx, endpointAddrs); err != nil { 644 level.Error(logger).Log("msg", "failed to resolve addresses passed using endpoint flag", "err", err) 645 646 } 647 return nil 648 }) 649 }, func(error) { 650 cancel() 651 }) 652 } 653 654 grpcProbe := prober.NewGRPC() 655 httpProbe := prober.NewHTTP() 656 statusProber := prober.Combine( 657 httpProbe, 658 grpcProbe, 659 prober.NewInstrumentation(comp, logger, extprom.WrapRegistererWithPrefix("thanos_", reg)), 660 ) 661 662 engineOpts := promql.EngineOpts{ 663 Logger: logger, 664 Reg: reg, 665 // TODO(bwplotka): Expose this as a flag: https://github.com/thanos-io/thanos/issues/703. 
666 MaxSamples: math.MaxInt32, 667 Timeout: queryTimeout, 668 LookbackDelta: lookbackDelta, 669 NoStepSubqueryIntervalFn: func(int64) int64 { 670 return defaultEvaluationInterval.Milliseconds() 671 }, 672 EnableNegativeOffset: true, 673 EnableAtModifier: true, 674 } 675 676 // An active query tracker will be added only if the user specifies a non-default path. 677 // Otherwise, the nil active query tracker from existing engine options will be used. 678 if activeQueryDir != "" { 679 engineOpts.ActiveQueryTracker = promql.NewActiveQueryTracker(activeQueryDir, maxConcurrentQueries, logger) 680 } 681 682 var remoteEngineEndpoints api.RemoteEndpoints 683 if queryMode != queryModeLocal { 684 remoteEngineEndpoints = query.NewRemoteEndpoints(logger, endpoints.GetQueryAPIClients, query.Opts{ 685 AutoDownsample: enableAutodownsampling, 686 ReplicaLabels: queryReplicaLabels, 687 Timeout: queryTimeout, 688 EnablePartialResponse: enableQueryPartialResponse, 689 }) 690 } 691 692 engineFactory := apiv1.NewQueryEngineFactory( 693 engineOpts, 694 remoteEngineEndpoints, 695 ) 696 697 lookbackDeltaCreator := LookbackDeltaFactory(engineOpts, dynamicLookbackDelta) 698 699 // Start query API + UI HTTP server. 700 { 701 router := route.New() 702 703 // RoutePrefix must always start with '/'. 704 webRoutePrefix = "/" + strings.Trim(webRoutePrefix, "/") 705 706 // Redirect from / to /webRoutePrefix. 707 if webRoutePrefix != "/" { 708 router.Get("/", func(w http.ResponseWriter, r *http.Request) { 709 http.Redirect(w, r, webRoutePrefix+"/graph", http.StatusFound) 710 }) 711 router.Get(webRoutePrefix, func(w http.ResponseWriter, r *http.Request) { 712 http.Redirect(w, r, webRoutePrefix+"/graph", http.StatusFound) 713 }) 714 router = router.WithPrefix(webRoutePrefix) 715 } 716 717 // Configure Request Logging for HTTP calls. 718 logMiddleware := logging.NewHTTPServerMiddleware(logger, httpLogOpts...) 
719 720 ins := extpromhttp.NewInstrumentationMiddleware(reg, nil) 721 // TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting. 722 ui.NewQueryUI(logger, endpoints, webExternalPrefix, webPrefixHeaderName, alertQueryURL).Register(router, ins) 723 724 api := apiv1.NewQueryAPI( 725 logger, 726 endpoints.GetEndpointStatus, 727 engineFactory, 728 apiv1.PromqlEngineType(defaultEngine), 729 lookbackDeltaCreator, 730 queryableCreator, 731 // NOTE: Will share the same replica label as the query for now. 732 rules.NewGRPCClientWithDedup(rulesProxy, queryReplicaLabels), 733 targets.NewGRPCClientWithDedup(targetsProxy, queryReplicaLabels), 734 metadata.NewGRPCClient(metadataProxy), 735 exemplars.NewGRPCClientWithDedup(exemplarsProxy, queryReplicaLabels), 736 enableAutodownsampling, 737 enableQueryPartialResponse, 738 enableRulePartialResponse, 739 enableTargetPartialResponse, 740 enableMetricMetadataPartialResponse, 741 enableExemplarPartialResponse, 742 enableQueryPushdown, 743 queryReplicaLabels, 744 flagsMap, 745 defaultRangeQueryStep, 746 instantDefaultMaxSourceResolution, 747 defaultMetadataTimeRange, 748 disableCORS, 749 gate.New( 750 extprom.WrapRegistererWithPrefix("thanos_query_concurrent_", reg), 751 maxConcurrentQueries, 752 gate.Queries, 753 ), 754 store.NewSeriesStatsAggregatorFactory( 755 reg, 756 queryTelemetryDurationQuantiles, 757 queryTelemetrySamplesQuantiles, 758 queryTelemetrySeriesQuantiles, 759 ), 760 reg, 761 tenantHeader, 762 defaultTenant, 763 tenantCertField, 764 ) 765 766 api.Register(router.WithPrefix("/api/v1"), tracer, logger, ins, logMiddleware) 767 768 srv := httpserver.New(logger, reg, comp, httpProbe, 769 httpserver.WithListen(httpBindAddr), 770 httpserver.WithGracePeriod(httpGracePeriod), 771 httpserver.WithTLSConfig(httpTLSConfig), 772 ) 773 srv.Handle("/", router) 774 775 g.Add(func() error { 776 statusProber.Healthy() 777 778 return srv.ListenAndServe() 779 }, func(err error) { 780 
statusProber.NotReady(err) 781 defer statusProber.NotHealthy(err) 782 783 srv.Shutdown(err) 784 }) 785 } 786 // Start query (proxy) gRPC StoreAPI. 787 { 788 tlsCfg, err := tls.NewServerConfig(log.With(logger, "protocol", "gRPC"), grpcServerConfig.tlsSrvCert, grpcServerConfig.tlsSrvKey, grpcServerConfig.tlsSrvClientCA) 789 if err != nil { 790 return errors.Wrap(err, "setup gRPC server") 791 } 792 793 infoSrv := info.NewInfoServer( 794 component.Query.String(), 795 info.WithLabelSetFunc(func() []labelpb.ZLabelSet { return proxy.LabelSet() }), 796 info.WithStoreInfoFunc(func() *infopb.StoreInfo { 797 if httpProbe.IsReady() { 798 mint, maxt := proxy.TimeRange() 799 return &infopb.StoreInfo{ 800 MinTime: mint, 801 MaxTime: maxt, 802 SupportsSharding: true, 803 SupportsWithoutReplicaLabels: true, 804 TsdbInfos: proxy.TSDBInfos(), 805 } 806 } 807 return nil 808 }), 809 info.WithExemplarsInfoFunc(), 810 info.WithRulesInfoFunc(), 811 info.WithMetricMetadataInfoFunc(), 812 info.WithTargetsInfoFunc(), 813 info.WithQueryAPIInfoFunc(), 814 ) 815 816 defaultEngineType := querypb.EngineType(querypb.EngineType_value[defaultEngine]) 817 grpcAPI := apiv1.NewGRPCAPI(time.Now, queryReplicaLabels, queryableCreator, engineFactory, defaultEngineType, lookbackDeltaCreator, instantDefaultMaxSourceResolution) 818 storeServer := store.NewLimitedStoreServer(store.NewInstrumentedStoreServer(reg, proxy), reg, storeRateLimits) 819 s := grpcserver.New(logger, reg, tracer, grpcLogOpts, tagOpts, comp, grpcProbe, 820 grpcserver.WithServer(apiv1.RegisterQueryServer(grpcAPI)), 821 grpcserver.WithServer(store.RegisterStoreServer(storeServer, logger)), 822 grpcserver.WithServer(rules.RegisterRulesServer(rulesProxy)), 823 grpcserver.WithServer(targets.RegisterTargetsServer(targetsProxy)), 824 grpcserver.WithServer(metadata.RegisterMetadataServer(metadataProxy)), 825 grpcserver.WithServer(exemplars.RegisterExemplarsServer(exemplarsProxy)), 826 grpcserver.WithServer(info.RegisterInfoServer(infoSrv)), 827 
grpcserver.WithListen(grpcServerConfig.bindAddress),
			grpcserver.WithGracePeriod(grpcServerConfig.gracePeriod),
			grpcserver.WithMaxConnAge(grpcServerConfig.maxConnectionAge),
			grpcserver.WithTLSConfig(tlsCfg),
		)

		g.Add(func() error {
			statusProber.Ready()
			return s.ListenAndServe()
		}, func(err error) {
			// Use the error handed to the interrupt function by the run group.
			// The enclosing err (from the TLS config setup) is always nil here,
			// so relying on it would drop the actual shutdown reason.
			statusProber.NotReady(err)
			s.Shutdown(err)
		})
	}

	level.Info(logger).Log("msg", "starting query node")
	return nil
}

// removeDuplicateEndpointSpecs returns specs with at most one entry per address,
// as reported by spec.Addr().
//
// For every repeated address a warning is logged and duplicatedStores is
// incremented. When an address occurs more than once the last spec provided
// for it is the one that is kept. The result preserves the order in which
// addresses were first seen, so the output is deterministic for a given input.
func removeDuplicateEndpointSpecs(logger log.Logger, duplicatedStores prometheus.Counter, specs []*query.GRPCEndpointSpec) []*query.GRPCEndpointSpec {
	// position maps an address to the index of its entry in deduplicated.
	position := make(map[string]int, len(specs))
	deduplicated := make([]*query.GRPCEndpointSpec, 0, len(specs))
	for _, spec := range specs {
		addr := spec.Addr()
		if i, ok := position[addr]; ok {
			level.Warn(logger).Log("msg", "Duplicate store address is provided", "addr", addr)
			duplicatedStores.Inc()
			// Later specs override earlier ones for the same address.
			deduplicated[i] = spec
			continue
		}
		position[addr] = len(deduplicated)
		deduplicated = append(deduplicated, spec)
	}
	return deduplicated
}

// LookbackDeltaFactory creates from 1 to 3 lookback deltas depending on
// dynamicLookbackDelta and eo.LookbackDelta and returns a function
// that returns the appropriate lookback delta for the given maxSourceResolutionMillis.
866 func LookbackDeltaFactory( 867 eo promql.EngineOpts, 868 dynamicLookbackDelta bool, 869 ) func(int64) time.Duration { 870 resolutions := []int64{downsample.ResLevel0} 871 if dynamicLookbackDelta { 872 resolutions = []int64{downsample.ResLevel0, downsample.ResLevel1, downsample.ResLevel2} 873 } 874 var ( 875 lds = make([]time.Duration, len(resolutions)) 876 ld = eo.LookbackDelta.Milliseconds() 877 ) 878 879 lookbackDelta := eo.LookbackDelta 880 for i, r := range resolutions { 881 if ld < r { 882 lookbackDelta = time.Duration(r) * time.Millisecond 883 } 884 885 lds[i] = lookbackDelta 886 } 887 return func(maxSourceResolutionMillis int64) time.Duration { 888 for i := len(resolutions) - 1; i >= 1; i-- { 889 left := resolutions[i-1] 890 if resolutions[i-1] < ld { 891 left = ld 892 } 893 if left < maxSourceResolutionMillis { 894 return lds[i] 895 } 896 } 897 return lds[0] 898 } 899 }