// Package v2 contains the v2 query-frontend: instead of queueing requests
// itself, this frontend forwards them to query-scheduler instances and waits
// for the querier to push the result back over gRPC (QueryResult).
package v2

import (
	"context"
	"flag"
	"fmt"
	"math/rand"
	"net/http"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/flagext"
	"github.com/grafana/dskit/grpcclient"
	"github.com/grafana/dskit/services"
	"github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/weaveworks/common/httpgrpc"
	"go.uber.org/atomic"

	"github.com/cortexproject/cortex/pkg/frontend/v2/frontendv2pb"
	"github.com/cortexproject/cortex/pkg/querier/stats"
	"github.com/cortexproject/cortex/pkg/tenant"
	"github.com/cortexproject/cortex/pkg/util/httpgrpcutil"
)

// Config for a Frontend.
type Config struct {
	// DNS hostname used to discover query-scheduler instances.
	SchedulerAddress string `yaml:"scheduler_address"`
	// How often SchedulerAddress is re-resolved to pick up new schedulers.
	DNSLookupPeriod time.Duration `yaml:"scheduler_dns_lookup_period"`
	// Number of concurrent worker goroutines per scheduler connection.
	WorkerConcurrency int `yaml:"scheduler_worker_concurrency"`
	GRPCClientConfig  grpcclient.Config `yaml:"grpc_client_config"`

	// Used to find local IP address, that is sent to scheduler and querier-worker.
	InfNames []string `yaml:"instance_interface_names"`

	// If set, address is not computed from interfaces.
	Addr string `yaml:"address" doc:"hidden"`
	Port int    `doc:"hidden"`
}

// RegisterFlags registers all Config options on the given FlagSet,
// including the embedded gRPC client config (prefixed with
// "frontend.grpc-client-config").
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	f.StringVar(&cfg.SchedulerAddress, "frontend.scheduler-address", "", "DNS hostname used for finding query-schedulers.")
	f.DurationVar(&cfg.DNSLookupPeriod, "frontend.scheduler-dns-lookup-period", 10*time.Second, "How often to resolve the scheduler-address, in order to look for new query-scheduler instances.")
	f.IntVar(&cfg.WorkerConcurrency, "frontend.scheduler-worker-concurrency", 5, "Number of concurrent workers forwarding queries to single query-scheduler.")

	cfg.InfNames = []string{"eth0", "en0"}
	f.Var((*flagext.StringSlice)(&cfg.InfNames), "frontend.instance-interface-names", "Name of network interface to read address from. This address is sent to query-scheduler and querier, which uses it to send the query response back to query-frontend.")
	f.StringVar(&cfg.Addr, "frontend.instance-addr", "", "IP address to advertise to querier (via scheduler) (resolved via interfaces by default).")
	f.IntVar(&cfg.Port, "frontend.instance-port", 0, "Port to advertise to querier (via scheduler) (defaults to server.grpc-listen-port).")

	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("frontend.grpc-client-config", f)
}

// Frontend implements GrpcRoundTripper. It queues HTTP requests,
// dispatches them to backends via gRPC, and handles retries for requests which failed.
type Frontend struct {
	services.Service

	cfg Config
	log log.Logger

	// Monotonic counter used to assign a unique queryID to each request.
	// Randomized at startup (see NewFrontend) to avoid collisions with
	// responses to queries issued before a restart.
	lastQueryID atomic.Uint64

	// frontend workers will read from this channel, and send request to scheduler.
	requestsCh chan *frontendRequest

	schedulerWorkers *frontendSchedulerWorkers
	// In-flight requests keyed by queryID; used by QueryResult to route an
	// incoming result back to the waiting RoundTripGRPC call.
	requests *requestsInProgress
}

// frontendRequest is the per-query bookkeeping shared between RoundTripGRPC
// (which creates it and waits on its channels) and the scheduler workers
// (which enqueue it and report the outcome).
type frontendRequest struct {
	queryID      uint64
	request      *httpgrpc.HTTPRequest
	userID       string
	statsEnabled bool

	// Cancels the request's context; stored so other goroutines could abort it.
	cancel context.CancelFunc

	// Both channels are buffered (size 1) so a worker can always complete its
	// send even if the waiting RoundTripGRPC goroutine has already gone away.
	enqueue  chan enqueueResult
	response chan *frontendv2pb.QueryResultRequest
}

// enqueueStatus reports the outcome of forwarding a request to a scheduler.
type enqueueStatus int

const (
	// Sent to scheduler successfully, and frontend should wait for response now.
	waitForResponse enqueueStatus = iota

	// Failed to forward request to scheduler, frontend will try again.
	failed
)

// enqueueResult is what a scheduler worker sends back on frontendRequest.enqueue.
type enqueueResult struct {
	status enqueueStatus

	cancelCh chan<- uint64 // Channel that can be used for request cancellation. If nil, cancellation is not possible.
}

// NewFrontend creates a new frontend.
//
// The returned Frontend is a dskit service; callers must start it (and the
// scheduler workers it owns) before using RoundTripGRPC.
func NewFrontend(cfg Config, log log.Logger, reg prometheus.Registerer) (*Frontend, error) {
	requestsCh := make(chan *frontendRequest)

	schedulerWorkers, err := newFrontendSchedulerWorkers(cfg, fmt.Sprintf("%s:%d", cfg.Addr, cfg.Port), requestsCh, log)
	if err != nil {
		return nil, err
	}

	f := &Frontend{
		cfg:              cfg,
		log:              log,
		requestsCh:       requestsCh,
		schedulerWorkers: schedulerWorkers,
		requests:         newRequestsInProgress(),
	}
	// Randomize to avoid getting responses from queries sent before restart, which could lead to mixing results
	// between different queries. Note that frontend verifies the user, so it cannot leak results between tenants.
	// This isn't perfect, but better than nothing.
	f.lastQueryID.Store(rand.Uint64())

	promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "cortex_query_frontend_queries_in_progress",
		Help: "Number of queries in progress handled by this frontend.",
	}, func() float64 {
		return float64(f.requests.count())
	})

	promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "cortex_query_frontend_connected_schedulers",
		Help: "Number of schedulers this frontend is connected to.",
	}, func() float64 {
		return float64(f.schedulerWorkers.getWorkersCount())
	})

	f.Service = services.NewIdleService(f.starting, f.stopping)
	return f, nil
}

// starting is the dskit service start hook: it brings up the scheduler
// workers and only reports running once they are.
func (f *Frontend) starting(ctx context.Context) error {
	return errors.Wrap(services.StartAndAwaitRunning(ctx, f.schedulerWorkers), "failed to start frontend scheduler workers")
}

// stopping is the dskit service stop hook: it shuts down the scheduler
// workers. A fresh background context is used because the service's own
// context is already done at this point.
func (f *Frontend) stopping(_ error) error {
	return errors.Wrap(services.StopAndAwaitTerminated(context.Background(), f.schedulerWorkers), "failed to stop frontend scheduler workers")
}

// RoundTripGRPC round trips a proto (instead of a HTTP request).
//
// It registers the request in f.requests, hands it to a scheduler worker via
// f.requestsCh, retries enqueueing on failure, and then blocks until either
// the querier pushes a result (via QueryResult) or ctx is cancelled.
func (f *Frontend) RoundTripGRPC(ctx context.Context, req *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error) {
	if s := f.State(); s != services.Running {
		return nil, fmt.Errorf("frontend not running: %v", s)
	}

	tenantIDs, err := tenant.TenantIDs(ctx)
	if err != nil {
		return nil, err
	}
	userID := tenant.JoinTenantIDs(tenantIDs)

	// Propagate trace context in gRPC too - this will be ignored if using HTTP.
	tracer, span := opentracing.GlobalTracer(), opentracing.SpanFromContext(ctx)
	if tracer != nil && span != nil {
		carrier := (*httpgrpcutil.HttpgrpcHeadersCarrier)(req)
		if err := tracer.Inject(span.Context(), opentracing.HTTPHeaders, carrier); err != nil {
			return nil, err
		}
	}

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	freq := &frontendRequest{
		queryID:      f.lastQueryID.Inc(),
		request:      req,
		userID:       userID,
		statsEnabled: stats.IsEnabled(ctx),

		cancel: cancel,

		// Buffer of 1 to ensure response or error can be written to the channel
		// even if this goroutine goes away due to client context cancellation.
		enqueue:  make(chan enqueueResult, 1),
		response: make(chan *frontendv2pb.QueryResultRequest, 1),
	}

	f.requests.put(freq)
	defer f.requests.delete(freq.queryID)

	retries := f.cfg.WorkerConcurrency + 1 // To make sure we hit at least two different schedulers.

enqueueAgain:
	select {
	case <-ctx.Done():
		return nil, ctx.Err()

	case f.requestsCh <- freq:
		// Enqueued, let's wait for response.
	}

	var cancelCh chan<- uint64

	select {
	case <-ctx.Done():
		return nil, ctx.Err()

	case enqRes := <-freq.enqueue:
		if enqRes.status == waitForResponse {
			cancelCh = enqRes.cancelCh
			break // go wait for response.
		} else if enqRes.status == failed {
			retries--
			if retries > 0 {
				goto enqueueAgain
			}
		}

		return nil, httpgrpc.Errorf(http.StatusInternalServerError, "failed to enqueue request")
	}

	select {
	case <-ctx.Done():
		// Best-effort: tell the scheduler to drop the query. The send is
		// non-blocking so a full/absent cancel channel never stalls us here.
		if cancelCh != nil {
			select {
			case cancelCh <- freq.queryID:
				// cancellation sent.
			default:
				// failed to cancel, ignore.
			}
		}
		return nil, ctx.Err()

	case resp := <-freq.response:
		if stats.ShouldTrackHTTPGRPCResponse(resp.HttpResponse) {
			stats := stats.FromContext(ctx)
			stats.Merge(resp.Stats) // Safe if stats is nil.
		}

		return resp.HttpResponse, nil
	}
}

// QueryResult delivers a query result to the RoundTripGRPC call waiting on
// the matching queryID. It is part of the frontendv2pb gRPC server surface;
// presumably invoked by the querier that executed the query — confirm
// against the frontendv2pb service definition.
func (f *Frontend) QueryResult(ctx context.Context, qrReq *frontendv2pb.QueryResultRequest) (*frontendv2pb.QueryResultResponse, error) {
	tenantIDs, err := tenant.TenantIDs(ctx)
	if err != nil {
		return nil, err
	}
	userID := tenant.JoinTenantIDs(tenantIDs)

	req := f.requests.get(qrReq.QueryID)
	// It is possible that some old response belonging to different user was received, if frontend has restarted.
	// To avoid leaking query results between users, we verify the user here.
	// To avoid mixing results from different queries, we randomize queryID counter on start.
	if req != nil && req.userID == userID {
		select {
		case req.response <- qrReq:
			// Should always be possible, unless QueryResult is called multiple times with the same queryID.
		default:
			level.Warn(f.log).Log("msg", "failed to write query result to the response channel", "queryID", qrReq.QueryID, "user", userID)
		}
	}

	// Stale or mismatched results are deliberately swallowed: the response
	// to the caller is always success.
	return &frontendv2pb.QueryResultResponse{}, nil
}

// CheckReady determines if the query frontend is ready. Function parameters/return
// chosen to match the same method in the ingester
func (f *Frontend) CheckReady(_ context.Context) error {
	workers := f.schedulerWorkers.getWorkersCount()

	// If frontend is connected to at least one scheduler, we are ready.
	if workers > 0 {
		return nil
	}

	msg := fmt.Sprintf("not ready: number of schedulers this worker is connected to is %d", workers)
	level.Info(f.log).Log("msg", msg)
	return errors.New(msg)
}

// requestsInProgress is a mutex-guarded map of in-flight requests keyed by
// queryID. All methods are safe for concurrent use.
type requestsInProgress struct {
	mu       sync.Mutex
	requests map[uint64]*frontendRequest
}

// newRequestsInProgress returns an empty, ready-to-use tracker.
func newRequestsInProgress() *requestsInProgress {
	return &requestsInProgress{
		requests: map[uint64]*frontendRequest{},
	}
}

// count returns the number of requests currently in flight.
func (r *requestsInProgress) count() int {
	r.mu.Lock()
	defer r.mu.Unlock()

	return len(r.requests)
}

// put registers req under its queryID, replacing any existing entry.
func (r *requestsInProgress) put(req *frontendRequest) {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.requests[req.queryID] = req
}

// delete removes the request with the given queryID; no-op if absent.
func (r *requestsInProgress) delete(queryID uint64) {
	r.mu.Lock()
	defer r.mu.Unlock()

	delete(r.requests, queryID)
}

// get returns the in-flight request for queryID, or nil if none.
func (r *requestsInProgress) get(queryID uint64) *frontendRequest {
	r.mu.Lock()
	defer r.mu.Unlock()

	return r.requests[queryID]
}