github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/cortex/cortex.go (about) 1 package cortex 2 3 import ( 4 "bytes" 5 "context" 6 "flag" 7 "fmt" 8 "net/http" 9 "os" 10 "reflect" 11 "strings" 12 13 "github.com/go-kit/log" 14 "github.com/go-kit/log/level" 15 "github.com/grafana/dskit/flagext" 16 "github.com/grafana/dskit/grpcutil" 17 "github.com/grafana/dskit/kv/memberlist" 18 "github.com/grafana/dskit/modules" 19 "github.com/grafana/dskit/ring" 20 "github.com/grafana/dskit/runtimeconfig" 21 "github.com/grafana/dskit/services" 22 "github.com/pkg/errors" 23 "github.com/prometheus/client_golang/prometheus" 24 "github.com/prometheus/prometheus/promql" 25 prom_storage "github.com/prometheus/prometheus/storage" 26 "github.com/weaveworks/common/server" 27 "github.com/weaveworks/common/signals" 28 "google.golang.org/grpc/health/grpc_health_v1" 29 "gopkg.in/yaml.v2" 30 31 "github.com/cortexproject/cortex/pkg/alertmanager" 32 "github.com/cortexproject/cortex/pkg/alertmanager/alertstore" 33 "github.com/cortexproject/cortex/pkg/api" 34 "github.com/cortexproject/cortex/pkg/chunk" 35 "github.com/cortexproject/cortex/pkg/chunk/encoding" 36 "github.com/cortexproject/cortex/pkg/chunk/purger" 37 "github.com/cortexproject/cortex/pkg/chunk/storage" 38 chunk_util "github.com/cortexproject/cortex/pkg/chunk/util" 39 "github.com/cortexproject/cortex/pkg/compactor" 40 "github.com/cortexproject/cortex/pkg/configs" 41 configAPI "github.com/cortexproject/cortex/pkg/configs/api" 42 "github.com/cortexproject/cortex/pkg/configs/db" 43 "github.com/cortexproject/cortex/pkg/cortexpb" 44 "github.com/cortexproject/cortex/pkg/distributor" 45 "github.com/cortexproject/cortex/pkg/flusher" 46 "github.com/cortexproject/cortex/pkg/frontend" 47 frontendv1 "github.com/cortexproject/cortex/pkg/frontend/v1" 48 "github.com/cortexproject/cortex/pkg/ingester" 49 "github.com/cortexproject/cortex/pkg/ingester/client" 50 "github.com/cortexproject/cortex/pkg/querier" 51 "github.com/cortexproject/cortex/pkg/querier/queryrange" 52 "github.com/cortexproject/cortex/pkg/querier/tenantfederation" 53 querier_worker "github.com/cortexproject/cortex/pkg/querier/worker" 54 "github.com/cortexproject/cortex/pkg/ruler" 55 "github.com/cortexproject/cortex/pkg/ruler/rulestore" 56 "github.com/cortexproject/cortex/pkg/scheduler" 57 "github.com/cortexproject/cortex/pkg/storage/tsdb" 58 "github.com/cortexproject/cortex/pkg/storegateway" 59 "github.com/cortexproject/cortex/pkg/tenant" 60 "github.com/cortexproject/cortex/pkg/util" 61 "github.com/cortexproject/cortex/pkg/util/fakeauth" 62 util_log "github.com/cortexproject/cortex/pkg/util/log" 63 "github.com/cortexproject/cortex/pkg/util/process" 64 "github.com/cortexproject/cortex/pkg/util/validation" 65 ) 66 67 var ( 68 errInvalidHTTPPrefix = errors.New("HTTP prefix should be empty or start with /") 69 ) 70 71 // The design pattern for Cortex is a series of config objects, which are 72 // registered for command line flags, and then a series of components that 73 // are instantiated and composed. Some rules of thumb: 74 // - Config types should only contain 'simple' types (ints, strings, urls etc). 75 // - Flag validation should be done by the flag; use a flag.Value where 76 // appropriate. 77 // - Config types should map 1:1 with a component type. 78 // - Config types should define flags with a common prefix. 79 // - It's fine to nest configs within configs, but this should match the 80 // nesting of components within components. 81 // - Limit as much is possible sharing of configuration between config types. 82 // Where necessary, use a pointer for this - avoid repetition. 83 // - Where a nesting of components its not obvious, it's fine to pass 84 // references to other components constructors to compose them. 85 // - First argument for a components constructor should be its matching config 86 // object. 87 88 // Config is the root config for Cortex. 89 type Config struct { 90 Target flagext.StringSliceCSV `yaml:"target"` 91 AuthEnabled bool `yaml:"auth_enabled"` 92 PrintConfig bool `yaml:"-"` 93 HTTPPrefix string `yaml:"http_prefix"` 94 95 API api.Config `yaml:"api"` 96 Server server.Config `yaml:"server"` 97 Distributor distributor.Config `yaml:"distributor"` 98 Querier querier.Config `yaml:"querier"` 99 IngesterClient client.Config `yaml:"ingester_client"` 100 Ingester ingester.Config `yaml:"ingester"` 101 Flusher flusher.Config `yaml:"flusher"` 102 Storage storage.Config `yaml:"storage"` 103 ChunkStore chunk.StoreConfig `yaml:"chunk_store"` 104 Schema chunk.SchemaConfig `yaml:"schema" doc:"hidden"` // Doc generation tool doesn't support it because part of the SchemaConfig doesn't support CLI flags (needs manual documentation) 105 LimitsConfig validation.Limits `yaml:"limits"` 106 Prealloc cortexpb.PreallocConfig `yaml:"prealloc" doc:"hidden"` 107 Worker querier_worker.Config `yaml:"frontend_worker"` 108 Frontend frontend.CombinedFrontendConfig `yaml:"frontend"` 109 QueryRange queryrange.Config `yaml:"query_range"` 110 TableManager chunk.TableManagerConfig `yaml:"table_manager"` 111 Encoding encoding.Config `yaml:"-"` // No yaml for this, it only works with flags. 112 BlocksStorage tsdb.BlocksStorageConfig `yaml:"blocks_storage"` 113 Compactor compactor.Config `yaml:"compactor"` 114 StoreGateway storegateway.Config `yaml:"store_gateway"` 115 PurgerConfig purger.Config `yaml:"purger"` 116 TenantFederation tenantfederation.Config `yaml:"tenant_federation"` 117 118 Ruler ruler.Config `yaml:"ruler"` 119 RulerStorage rulestore.Config `yaml:"ruler_storage"` 120 Configs configs.Config `yaml:"configs"` 121 Alertmanager alertmanager.MultitenantAlertmanagerConfig `yaml:"alertmanager"` 122 AlertmanagerStorage alertstore.Config `yaml:"alertmanager_storage"` 123 RuntimeConfig runtimeconfig.Config `yaml:"runtime_config"` 124 MemberlistKV memberlist.KVConfig `yaml:"memberlist"` 125 QueryScheduler scheduler.Config `yaml:"query_scheduler"` 126 } 127 128 // RegisterFlags registers flag. 129 func (c *Config) RegisterFlags(f *flag.FlagSet) { 130 c.Server.MetricsNamespace = "cortex" 131 c.Server.ExcludeRequestInLog = true 132 133 // Set the default module list to 'all' 134 c.Target = []string{All} 135 136 f.Var(&c.Target, "target", "Comma-separated list of Cortex modules to load. "+ 137 "The alias 'all' can be used in the list to load a number of core modules and will enable single-binary mode. "+ 138 "Use '-modules' command line flag to get a list of available modules, and to see which modules are included in 'all'.") 139 140 f.BoolVar(&c.AuthEnabled, "auth.enabled", true, "Set to false to disable auth.") 141 f.BoolVar(&c.PrintConfig, "print.config", false, "Print the config and exit.") 142 f.StringVar(&c.HTTPPrefix, "http.prefix", "/api/prom", "HTTP path prefix for Cortex API.") 143 144 c.API.RegisterFlags(f) 145 c.registerServerFlagsWithChangedDefaultValues(f) 146 c.Distributor.RegisterFlags(f) 147 c.Querier.RegisterFlags(f) 148 c.IngesterClient.RegisterFlags(f) 149 c.Ingester.RegisterFlags(f) 150 c.Flusher.RegisterFlags(f) 151 c.Storage.RegisterFlags(f) 152 c.ChunkStore.RegisterFlags(f) 153 c.Schema.RegisterFlags(f) 154 c.LimitsConfig.RegisterFlags(f) 155 c.Prealloc.RegisterFlags(f) 156 c.Worker.RegisterFlags(f) 157 c.Frontend.RegisterFlags(f) 158 c.QueryRange.RegisterFlags(f) 159 c.TableManager.RegisterFlags(f) 160 c.Encoding.RegisterFlags(f) 161 c.BlocksStorage.RegisterFlags(f) 162 c.Compactor.RegisterFlags(f) 163 c.StoreGateway.RegisterFlags(f) 164 c.PurgerConfig.RegisterFlags(f) 165 c.TenantFederation.RegisterFlags(f) 166 167 c.Ruler.RegisterFlags(f) 168 c.RulerStorage.RegisterFlags(f) 169 c.Configs.RegisterFlags(f) 170 c.Alertmanager.RegisterFlags(f) 171 c.AlertmanagerStorage.RegisterFlags(f) 172 c.RuntimeConfig.RegisterFlags(f) 173 c.MemberlistKV.RegisterFlags(f) 174 c.QueryScheduler.RegisterFlags(f) 175 176 // These don't seem to have a home. 177 f.IntVar(&chunk_util.QueryParallelism, "querier.query-parallelism", 100, "Max subqueries run in parallel per higher-level query.") 178 } 179 180 // Validate the cortex config and returns an error if the validation 181 // doesn't pass 182 func (c *Config) Validate(log log.Logger) error { 183 if err := c.validateYAMLEmptyNodes(); err != nil { 184 return err 185 } 186 187 if c.HTTPPrefix != "" && !strings.HasPrefix(c.HTTPPrefix, "/") { 188 return errInvalidHTTPPrefix 189 } 190 191 if err := c.Schema.Validate(); err != nil { 192 return errors.Wrap(err, "invalid schema config") 193 } 194 if err := c.Encoding.Validate(); err != nil { 195 return errors.Wrap(err, "invalid encoding config") 196 } 197 if err := c.Storage.Validate(); err != nil { 198 return errors.Wrap(err, "invalid storage config") 199 } 200 if err := c.ChunkStore.Validate(log); err != nil { 201 return errors.Wrap(err, "invalid chunk store config") 202 } 203 if err := c.RulerStorage.Validate(); err != nil { 204 return errors.Wrap(err, "invalid rulestore config") 205 } 206 if err := c.Ruler.Validate(c.LimitsConfig, log); err != nil { 207 return errors.Wrap(err, "invalid ruler config") 208 } 209 if err := c.BlocksStorage.Validate(); err != nil { 210 return errors.Wrap(err, "invalid TSDB config") 211 } 212 if err := c.LimitsConfig.Validate(c.Distributor.ShardByAllLabels); err != nil { 213 return errors.Wrap(err, "invalid limits config") 214 } 215 if err := c.Distributor.Validate(c.LimitsConfig); err != nil { 216 return errors.Wrap(err, "invalid distributor config") 217 } 218 if err := c.Querier.Validate(); err != nil { 219 return errors.Wrap(err, "invalid querier config") 220 } 221 if err := c.IngesterClient.Validate(log); err != nil { 222 return errors.Wrap(err, "invalid ingester_client config") 223 } 224 if err := c.Worker.Validate(log); err != nil { 225 return errors.Wrap(err, "invalid frontend_worker config") 226 } 227 if err := c.QueryRange.Validate(); err != nil { 228 return errors.Wrap(err, "invalid query_range config") 229 } 230 if err := c.TableManager.Validate(); err != nil { 231 return errors.Wrap(err, "invalid table-manager config") 232 } 233 if err := c.StoreGateway.Validate(c.LimitsConfig); err != nil { 234 return errors.Wrap(err, "invalid store-gateway config") 235 } 236 if err := c.Compactor.Validate(); err != nil { 237 return errors.Wrap(err, "invalid compactor config") 238 } 239 if err := c.AlertmanagerStorage.Validate(); err != nil { 240 return errors.Wrap(err, "invalid alertmanager storage config") 241 } 242 if err := c.Alertmanager.Validate(c.AlertmanagerStorage); err != nil { 243 return errors.Wrap(err, "invalid alertmanager config") 244 } 245 246 if c.Storage.Engine == storage.StorageEngineBlocks && c.Querier.SecondStoreEngine != storage.StorageEngineChunks && len(c.Schema.Configs) > 0 { 247 level.Warn(log).Log("schema configuration is not used by the blocks storage engine, and will have no effect") 248 } 249 250 return nil 251 } 252 253 func (c *Config) isModuleEnabled(m string) bool { 254 return util.StringsContain(c.Target, m) 255 } 256 257 // validateYAMLEmptyNodes ensure that no empty node has been specified in the YAML config file. 258 // When an empty node is defined in YAML, the YAML parser sets the whole struct to its zero value 259 // and so we loose all default values. It's very difficult to detect this case for the user, so we 260 // try to prevent it (on the root level) with this custom validation. 261 func (c *Config) validateYAMLEmptyNodes() error { 262 defaults := Config{} 263 flagext.DefaultValues(&defaults) 264 265 defStruct := reflect.ValueOf(defaults) 266 cfgStruct := reflect.ValueOf(*c) 267 268 // We expect all structs are the exact same. This check should never fail. 269 if cfgStruct.NumField() != defStruct.NumField() { 270 return errors.New("unable to validate configuration because of mismatching internal config data structure") 271 } 272 273 for i := 0; i < cfgStruct.NumField(); i++ { 274 // If the struct has been reset due to empty YAML value and the zero struct value 275 // doesn't match the default one, then we should warn the user about the issue. 276 if cfgStruct.Field(i).Kind() == reflect.Struct && cfgStruct.Field(i).IsZero() && !defStruct.Field(i).IsZero() { 277 return fmt.Errorf("the %s configuration in YAML has been specified as an empty YAML node", cfgStruct.Type().Field(i).Name) 278 } 279 } 280 281 return nil 282 } 283 284 func (c *Config) registerServerFlagsWithChangedDefaultValues(fs *flag.FlagSet) { 285 throwaway := flag.NewFlagSet("throwaway", flag.PanicOnError) 286 287 // Register to throwaway flags first. Default values are remembered during registration and cannot be changed, 288 // but we can take values from throwaway flag set and reregister into supplied flags with new default values. 289 c.Server.RegisterFlags(throwaway) 290 291 throwaway.VisitAll(func(f *flag.Flag) { 292 // Ignore errors when setting new values. We have a test to verify that it works. 293 switch f.Name { 294 case "server.grpc.keepalive.min-time-between-pings": 295 _ = f.Value.Set("10s") 296 297 case "server.grpc.keepalive.ping-without-stream-allowed": 298 _ = f.Value.Set("true") 299 } 300 301 fs.Var(f.Value, f.Name, f.Usage) 302 }) 303 } 304 305 // Cortex is the root datastructure for Cortex. 306 type Cortex struct { 307 Cfg Config 308 309 // set during initialization 310 ServiceMap map[string]services.Service 311 ModuleManager *modules.Manager 312 313 API *api.API 314 Server *server.Server 315 Ring *ring.Ring 316 TenantLimits validation.TenantLimits 317 Overrides *validation.Overrides 318 Distributor *distributor.Distributor 319 Ingester *ingester.Ingester 320 Flusher *flusher.Flusher 321 Store chunk.Store 322 DeletesStore *purger.DeleteStore 323 Frontend *frontendv1.Frontend 324 TableManager *chunk.TableManager 325 RuntimeConfig *runtimeconfig.Manager 326 Purger *purger.Purger 327 TombstonesLoader *purger.TombstonesLoader 328 QuerierQueryable prom_storage.SampleAndChunkQueryable 329 ExemplarQueryable prom_storage.ExemplarQueryable 330 QuerierEngine *promql.Engine 331 QueryFrontendTripperware queryrange.Tripperware 332 333 Ruler *ruler.Ruler 334 RulerStorage rulestore.RuleStore 335 ConfigAPI *configAPI.API 336 ConfigDB db.DB 337 Alertmanager *alertmanager.MultitenantAlertmanager 338 Compactor *compactor.Compactor 339 StoreGateway *storegateway.StoreGateway 340 MemberlistKV *memberlist.KVInitService 341 342 // Queryables that the querier should use to query the long 343 // term storage. It depends on the storage engine used. 344 StoreQueryables []querier.QueryableWithFilter 345 } 346 347 // New makes a new Cortex. 348 func New(cfg Config) (*Cortex, error) { 349 if cfg.PrintConfig { 350 if err := yaml.NewEncoder(os.Stdout).Encode(&cfg); err != nil { 351 fmt.Println("Error encoding config:", err) 352 } 353 os.Exit(0) 354 } 355 356 // Swap out the default resolver to support multiple tenant IDs separated by a '|' 357 if cfg.TenantFederation.Enabled { 358 util_log.WarnExperimentalUse("tenant-federation") 359 tenant.WithDefaultResolver(tenant.NewMultiResolver()) 360 } 361 362 // Don't check auth header on TransferChunks, as we weren't originally 363 // sending it and this could cause transfers to fail on update. 364 cfg.API.HTTPAuthMiddleware = fakeauth.SetupAuthMiddleware(&cfg.Server, cfg.AuthEnabled, 365 // Also don't check auth for these gRPC methods, since single call is used for multiple users (or no user like health check). 366 []string{ 367 "/grpc.health.v1.Health/Check", 368 "/cortex.Ingester/TransferChunks", 369 "/frontend.Frontend/Process", 370 "/frontend.Frontend/NotifyClientShutdown", 371 "/schedulerpb.SchedulerForFrontend/FrontendLoop", 372 "/schedulerpb.SchedulerForQuerier/QuerierLoop", 373 "/schedulerpb.SchedulerForQuerier/NotifyQuerierShutdown", 374 }) 375 376 cortex := &Cortex{ 377 Cfg: cfg, 378 } 379 380 cortex.setupThanosTracing() 381 382 if err := cortex.setupModuleManager(); err != nil { 383 return nil, err 384 } 385 386 return cortex, nil 387 } 388 389 // setupThanosTracing appends a gRPC middleware used to inject our tracer into the custom 390 // context used by Thanos, in order to get Thanos spans correctly attached to our traces. 391 func (t *Cortex) setupThanosTracing() { 392 t.Cfg.Server.GRPCMiddleware = append(t.Cfg.Server.GRPCMiddleware, ThanosTracerUnaryInterceptor) 393 t.Cfg.Server.GRPCStreamMiddleware = append(t.Cfg.Server.GRPCStreamMiddleware, ThanosTracerStreamInterceptor) 394 } 395 396 // Run starts Cortex running, and blocks until a Cortex stops. 397 func (t *Cortex) Run() error { 398 // Register custom process metrics. 399 if c, err := process.NewProcessCollector(); err == nil { 400 prometheus.MustRegister(c) 401 } else { 402 level.Warn(util_log.Logger).Log("msg", "skipped registration of custom process metrics collector", "err", err) 403 } 404 405 for _, module := range t.Cfg.Target { 406 if !t.ModuleManager.IsUserVisibleModule(module) { 407 level.Warn(util_log.Logger).Log("msg", "selected target is an internal module, is this intended?", "target", module) 408 } 409 } 410 411 var err error 412 t.ServiceMap, err = t.ModuleManager.InitModuleServices(t.Cfg.Target...) 413 if err != nil { 414 return err 415 } 416 417 t.API.RegisterServiceMapHandler(http.HandlerFunc(t.servicesHandler)) 418 419 // get all services, create service manager and tell it to start 420 servs := []services.Service(nil) 421 for _, s := range t.ServiceMap { 422 servs = append(servs, s) 423 } 424 425 sm, err := services.NewManager(servs...) 426 if err != nil { 427 return err 428 } 429 430 // before starting servers, register /ready handler and gRPC health check service. 431 // It should reflect entire Cortex. 432 t.Server.HTTP.Path("/ready").Handler(t.readyHandler(sm)) 433 grpc_health_v1.RegisterHealthServer(t.Server.GRPC, grpcutil.NewHealthCheck(sm)) 434 435 // Let's listen for events from this manager, and log them. 436 healthy := func() { level.Info(util_log.Logger).Log("msg", "Cortex started") } 437 stopped := func() { level.Info(util_log.Logger).Log("msg", "Cortex stopped") } 438 serviceFailed := func(service services.Service) { 439 // if any service fails, stop entire Cortex 440 sm.StopAsync() 441 442 // let's find out which module failed 443 for m, s := range t.ServiceMap { 444 if s == service { 445 if service.FailureCase() == modules.ErrStopProcess { 446 level.Info(util_log.Logger).Log("msg", "received stop signal via return error", "module", m, "err", service.FailureCase()) 447 } else { 448 level.Error(util_log.Logger).Log("msg", "module failed", "module", m, "err", service.FailureCase()) 449 } 450 return 451 } 452 } 453 454 level.Error(util_log.Logger).Log("msg", "module failed", "module", "unknown", "err", service.FailureCase()) 455 } 456 457 sm.AddListener(services.NewManagerListener(healthy, stopped, serviceFailed)) 458 459 // Setup signal handler. If signal arrives, we stop the manager, which stops all the services. 460 handler := signals.NewHandler(t.Server.Log) 461 go func() { 462 handler.Loop() 463 sm.StopAsync() 464 }() 465 466 // Start all services. This can really only fail if some service is already 467 // in other state than New, which should not be the case. 468 err = sm.StartAsync(context.Background()) 469 if err == nil { 470 // Wait until service manager stops. It can stop in two ways: 471 // 1) Signal is received and manager is stopped. 472 // 2) Any service fails. 473 err = sm.AwaitStopped(context.Background()) 474 } 475 476 // If there is no error yet (= service manager started and then stopped without problems), 477 // but any service failed, report that failure as an error to caller. 478 if err == nil { 479 if failed := sm.ServicesByState()[services.Failed]; len(failed) > 0 { 480 for _, f := range failed { 481 if f.FailureCase() != modules.ErrStopProcess { 482 // Details were reported via failure listener before 483 err = errors.New("failed services") 484 break 485 } 486 } 487 } 488 } 489 return err 490 } 491 492 func (t *Cortex) readyHandler(sm *services.Manager) http.HandlerFunc { 493 return func(w http.ResponseWriter, r *http.Request) { 494 if !sm.IsHealthy() { 495 msg := bytes.Buffer{} 496 msg.WriteString("Some services are not Running:\n") 497 498 byState := sm.ServicesByState() 499 for st, ls := range byState { 500 msg.WriteString(fmt.Sprintf("%v: %d\n", st, len(ls))) 501 } 502 503 http.Error(w, msg.String(), http.StatusServiceUnavailable) 504 return 505 } 506 507 // Ingester has a special check that makes sure that it was able to register into the ring, 508 // and that all other ring entries are OK too. 509 if t.Ingester != nil { 510 if err := t.Ingester.CheckReady(r.Context()); err != nil { 511 http.Error(w, "Ingester not ready: "+err.Error(), http.StatusServiceUnavailable) 512 return 513 } 514 } 515 516 // Query Frontend has a special check that makes sure that a querier is attached before it signals 517 // itself as ready 518 if t.Frontend != nil { 519 if err := t.Frontend.CheckReady(r.Context()); err != nil { 520 http.Error(w, "Query Frontend not ready: "+err.Error(), http.StatusServiceUnavailable) 521 return 522 } 523 } 524 525 util.WriteTextResponse(w, "ready") 526 } 527 }