github.com/thanos-io/thanos@v0.32.5/internal/cortex/frontend/transport/handler.go (about) 1 // Copyright (c) The Cortex Authors. 2 // Licensed under the Apache License 2.0. 3 4 package transport 5 6 import ( 7 "bytes" 8 "context" 9 "errors" 10 "fmt" 11 "io" 12 "io/ioutil" 13 "net/http" 14 "net/url" 15 "strconv" 16 "strings" 17 "syscall" 18 "time" 19 20 "github.com/go-kit/log" 21 "github.com/go-kit/log/level" 22 "github.com/prometheus/client_golang/prometheus" 23 "github.com/prometheus/client_golang/prometheus/promauto" 24 "github.com/weaveworks/common/httpgrpc" 25 "github.com/weaveworks/common/httpgrpc/server" 26 27 querier_stats "github.com/thanos-io/thanos/internal/cortex/querier/stats" 28 "github.com/thanos-io/thanos/internal/cortex/tenant" 29 "github.com/thanos-io/thanos/internal/cortex/util" 30 util_log "github.com/thanos-io/thanos/internal/cortex/util/log" 31 ) 32 33 const ( 34 // StatusClientClosedRequest is the status code for when a client request cancellation of an http request 35 StatusClientClosedRequest = 499 36 ServiceTimingHeaderName = "Server-Timing" 37 ) 38 39 var ( 40 errCanceled = httpgrpc.Errorf(StatusClientClosedRequest, context.Canceled.Error()) 41 errDeadlineExceeded = httpgrpc.Errorf(http.StatusGatewayTimeout, context.DeadlineExceeded.Error()) 42 errRequestEntityTooLarge = httpgrpc.Errorf(http.StatusRequestEntityTooLarge, "http: request body too large") 43 ) 44 45 // Config for a Handler. 46 type HandlerConfig struct { 47 LogQueriesLongerThan time.Duration `yaml:"log_queries_longer_than"` 48 MaxBodySize int64 `yaml:"max_body_size"` 49 QueryStatsEnabled bool `yaml:"query_stats_enabled"` 50 } 51 52 // Handler accepts queries and forwards them to RoundTripper. It can log slow queries, 53 // but all other logic is inside the RoundTripper. 54 type Handler struct { 55 cfg HandlerConfig 56 log log.Logger 57 roundTripper http.RoundTripper 58 59 // Metrics. 60 querySeconds *prometheus.CounterVec 61 querySeries *prometheus.CounterVec 62 queryBytes *prometheus.CounterVec 63 activeUsers *util.ActiveUsersCleanupService 64 } 65 66 // NewHandler creates a new frontend handler. 67 func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logger, reg prometheus.Registerer) http.Handler { 68 h := &Handler{ 69 cfg: cfg, 70 log: log, 71 roundTripper: roundTripper, 72 } 73 74 if cfg.QueryStatsEnabled { 75 h.querySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 76 Name: "cortex_query_seconds_total", 77 Help: "Total amount of wall clock time spend processing queries.", 78 }, []string{"user"}) 79 80 h.querySeries = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 81 Name: "cortex_query_fetched_series_total", 82 Help: "Number of series fetched to execute a query.", 83 }, []string{"user"}) 84 85 h.queryBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 86 Name: "cortex_query_fetched_chunks_bytes_total", 87 Help: "Size of all chunks fetched to execute a query in bytes.", 88 }, []string{"user"}) 89 90 h.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(func(user string) { 91 h.querySeconds.DeleteLabelValues(user) 92 h.querySeries.DeleteLabelValues(user) 93 h.queryBytes.DeleteLabelValues(user) 94 }) 95 // If cleaner stops or fail, we will simply not clean the metrics for inactive users. 96 _ = h.activeUsers.StartAsync(context.Background()) 97 } 98 99 return h 100 } 101 102 func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { 103 var ( 104 stats *querier_stats.Stats 105 queryString url.Values 106 ) 107 108 // Initialise the stats in the context and make sure it's propagated 109 // down the request chain. 110 if f.cfg.QueryStatsEnabled { 111 var ctx context.Context 112 stats, ctx = querier_stats.ContextWithEmptyStats(r.Context()) 113 r = r.WithContext(ctx) 114 } 115 116 defer func() { 117 _ = r.Body.Close() 118 }() 119 120 // Buffer the body for later use to track slow queries. 121 var buf bytes.Buffer 122 r.Body = http.MaxBytesReader(w, r.Body, f.cfg.MaxBodySize) 123 r.Body = ioutil.NopCloser(io.TeeReader(r.Body, &buf)) 124 125 startTime := time.Now() 126 resp, err := f.roundTripper.RoundTrip(r) 127 queryResponseTime := time.Since(startTime) 128 129 if err != nil { 130 writeError(w, err) 131 return 132 } 133 134 hs := w.Header() 135 for h, vs := range resp.Header { 136 hs[h] = vs 137 } 138 139 if f.cfg.QueryStatsEnabled { 140 writeServiceTimingHeader(queryResponseTime, hs, stats) 141 } 142 143 w.WriteHeader(resp.StatusCode) 144 // log copy response body error so that we will know even though success response code returned 145 bytesCopied, err := io.Copy(w, resp.Body) 146 if err != nil && !errors.Is(err, syscall.EPIPE) { 147 level.Error(util_log.WithContext(r.Context(), f.log)).Log("msg", "write response body error", "bytesCopied", bytesCopied, "err", err) 148 } 149 150 // Check whether we should parse the query string. 151 shouldReportSlowQuery := f.cfg.LogQueriesLongerThan != 0 && queryResponseTime > f.cfg.LogQueriesLongerThan 152 if shouldReportSlowQuery || f.cfg.QueryStatsEnabled { 153 queryString = f.parseRequestQueryString(r, buf) 154 } 155 156 if shouldReportSlowQuery { 157 f.reportSlowQuery(r, hs, queryString, queryResponseTime) 158 } 159 if f.cfg.QueryStatsEnabled { 160 f.reportQueryStats(r, queryString, queryResponseTime, stats) 161 } 162 } 163 164 // reportSlowQuery reports slow queries. 165 func (f *Handler) reportSlowQuery(r *http.Request, responseHeaders http.Header, queryString url.Values, queryResponseTime time.Duration) { 166 // NOTE(GiedriusS): see https://github.com/grafana/grafana/pull/60301 for more info. 167 grafanaDashboardUID := "-" 168 if dashboardUID := r.Header.Get("X-Dashboard-Uid"); dashboardUID != "" { 169 grafanaDashboardUID = dashboardUID 170 } 171 grafanaPanelID := "-" 172 if panelID := r.Header.Get("X-Panel-Id"); panelID != "" { 173 grafanaPanelID = panelID 174 } 175 thanosTraceID := "-" 176 if traceID := responseHeaders.Get("X-Thanos-Trace-Id"); traceID != "" { 177 thanosTraceID = traceID 178 } 179 180 remoteUser, _, _ := r.BasicAuth() 181 182 logMessage := append([]interface{}{ 183 "msg", "slow query detected", 184 "method", r.Method, 185 "host", r.Host, 186 "path", r.URL.Path, 187 "remote_user", remoteUser, 188 "remote_addr", r.RemoteAddr, 189 "time_taken", queryResponseTime.String(), 190 "grafana_dashboard_uid", grafanaDashboardUID, 191 "grafana_panel_id", grafanaPanelID, 192 "trace_id", thanosTraceID, 193 }, formatQueryString(queryString)...) 194 195 level.Info(util_log.WithContext(r.Context(), f.log)).Log(logMessage...) 196 } 197 198 func (f *Handler) reportQueryStats(r *http.Request, queryString url.Values, queryResponseTime time.Duration, stats *querier_stats.Stats) { 199 tenantIDs, err := tenant.TenantIDs(r.Context()) 200 if err != nil { 201 return 202 } 203 userID := tenant.JoinTenantIDs(tenantIDs) 204 wallTime := stats.LoadWallTime() 205 numSeries := stats.LoadFetchedSeries() 206 numBytes := stats.LoadFetchedChunkBytes() 207 remoteUser, _, _ := r.BasicAuth() 208 209 // Track stats. 210 f.querySeconds.WithLabelValues(userID).Add(wallTime.Seconds()) 211 f.querySeries.WithLabelValues(userID).Add(float64(numSeries)) 212 f.queryBytes.WithLabelValues(userID).Add(float64(numBytes)) 213 f.activeUsers.UpdateUserTimestamp(userID, time.Now()) 214 215 // Log stats. 216 logMessage := append([]interface{}{ 217 "msg", "query stats", 218 "component", "query-frontend", 219 "method", r.Method, 220 "path", r.URL.Path, 221 "remote_user", remoteUser, 222 "remote_addr", r.RemoteAddr, 223 "response_time", queryResponseTime, 224 "query_wall_time_seconds", wallTime.Seconds(), 225 "fetched_series_count", numSeries, 226 "fetched_chunks_bytes", numBytes, 227 }, formatQueryString(queryString)...) 228 229 level.Info(util_log.WithContext(r.Context(), f.log)).Log(logMessage...) 230 } 231 232 func (f *Handler) parseRequestQueryString(r *http.Request, bodyBuf bytes.Buffer) url.Values { 233 // Use previously buffered body. 234 r.Body = ioutil.NopCloser(&bodyBuf) 235 236 // Ensure the form has been parsed so all the parameters are present 237 err := r.ParseForm() 238 if err != nil { 239 level.Warn(util_log.WithContext(r.Context(), f.log)).Log("msg", "unable to parse request form", "err", err) 240 return nil 241 } 242 243 return r.Form 244 } 245 246 func formatQueryString(queryString url.Values) (fields []interface{}) { 247 for k, v := range queryString { 248 fields = append(fields, fmt.Sprintf("param_%s", k), strings.Join(v, ",")) 249 } 250 return fields 251 } 252 253 func writeError(w http.ResponseWriter, err error) { 254 switch err { 255 case context.Canceled: 256 err = errCanceled 257 case context.DeadlineExceeded: 258 err = errDeadlineExceeded 259 default: 260 if util.IsRequestBodyTooLarge(err) { 261 err = errRequestEntityTooLarge 262 } 263 } 264 server.WriteError(w, err) 265 } 266 267 func writeServiceTimingHeader(queryResponseTime time.Duration, headers http.Header, stats *querier_stats.Stats) { 268 if stats != nil { 269 parts := make([]string, 0) 270 parts = append(parts, statsValue("querier_wall_time", stats.LoadWallTime())) 271 parts = append(parts, statsValue("response_time", queryResponseTime)) 272 headers.Set(ServiceTimingHeaderName, strings.Join(parts, ", ")) 273 } 274 } 275 276 func statsValue(name string, d time.Duration) string { 277 durationInMs := strconv.FormatFloat(float64(d)/float64(time.Millisecond), 'f', -1, 64) 278 return name + ";dur=" + durationInMs 279 }