github.com/thanos-io/thanos@v0.32.5/internal/cortex/frontend/transport/handler.go (about)

     1  // Copyright (c) The Cortex Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package transport
     5  
     6  import (
     7  	"bytes"
     8  	"context"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"io/ioutil"
    13  	"net/http"
    14  	"net/url"
    15  	"strconv"
    16  	"strings"
    17  	"syscall"
    18  	"time"
    19  
    20  	"github.com/go-kit/log"
    21  	"github.com/go-kit/log/level"
    22  	"github.com/prometheus/client_golang/prometheus"
    23  	"github.com/prometheus/client_golang/prometheus/promauto"
    24  	"github.com/weaveworks/common/httpgrpc"
    25  	"github.com/weaveworks/common/httpgrpc/server"
    26  
    27  	querier_stats "github.com/thanos-io/thanos/internal/cortex/querier/stats"
    28  	"github.com/thanos-io/thanos/internal/cortex/tenant"
    29  	"github.com/thanos-io/thanos/internal/cortex/util"
    30  	util_log "github.com/thanos-io/thanos/internal/cortex/util/log"
    31  )
    32  
    33  const (
    34  	// StatusClientClosedRequest is the status code for when a client request cancellation of an http request
    35  	StatusClientClosedRequest = 499
    36  	ServiceTimingHeaderName   = "Server-Timing"
    37  )
    38  
    39  var (
    40  	errCanceled              = httpgrpc.Errorf(StatusClientClosedRequest, context.Canceled.Error())
    41  	errDeadlineExceeded      = httpgrpc.Errorf(http.StatusGatewayTimeout, context.DeadlineExceeded.Error())
    42  	errRequestEntityTooLarge = httpgrpc.Errorf(http.StatusRequestEntityTooLarge, "http: request body too large")
    43  )
    44  
    45  // Config for a Handler.
    46  type HandlerConfig struct {
    47  	LogQueriesLongerThan time.Duration `yaml:"log_queries_longer_than"`
    48  	MaxBodySize          int64         `yaml:"max_body_size"`
    49  	QueryStatsEnabled    bool          `yaml:"query_stats_enabled"`
    50  }
    51  
    52  // Handler accepts queries and forwards them to RoundTripper. It can log slow queries,
    53  // but all other logic is inside the RoundTripper.
    54  type Handler struct {
    55  	cfg          HandlerConfig
    56  	log          log.Logger
    57  	roundTripper http.RoundTripper
    58  
    59  	// Metrics.
    60  	querySeconds *prometheus.CounterVec
    61  	querySeries  *prometheus.CounterVec
    62  	queryBytes   *prometheus.CounterVec
    63  	activeUsers  *util.ActiveUsersCleanupService
    64  }
    65  
    66  // NewHandler creates a new frontend handler.
    67  func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logger, reg prometheus.Registerer) http.Handler {
    68  	h := &Handler{
    69  		cfg:          cfg,
    70  		log:          log,
    71  		roundTripper: roundTripper,
    72  	}
    73  
    74  	if cfg.QueryStatsEnabled {
    75  		h.querySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
    76  			Name: "cortex_query_seconds_total",
    77  			Help: "Total amount of wall clock time spend processing queries.",
    78  		}, []string{"user"})
    79  
    80  		h.querySeries = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
    81  			Name: "cortex_query_fetched_series_total",
    82  			Help: "Number of series fetched to execute a query.",
    83  		}, []string{"user"})
    84  
    85  		h.queryBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
    86  			Name: "cortex_query_fetched_chunks_bytes_total",
    87  			Help: "Size of all chunks fetched to execute a query in bytes.",
    88  		}, []string{"user"})
    89  
    90  		h.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(func(user string) {
    91  			h.querySeconds.DeleteLabelValues(user)
    92  			h.querySeries.DeleteLabelValues(user)
    93  			h.queryBytes.DeleteLabelValues(user)
    94  		})
    95  		// If cleaner stops or fail, we will simply not clean the metrics for inactive users.
    96  		_ = h.activeUsers.StartAsync(context.Background())
    97  	}
    98  
    99  	return h
   100  }
   101  
   102  func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
   103  	var (
   104  		stats       *querier_stats.Stats
   105  		queryString url.Values
   106  	)
   107  
   108  	// Initialise the stats in the context and make sure it's propagated
   109  	// down the request chain.
   110  	if f.cfg.QueryStatsEnabled {
   111  		var ctx context.Context
   112  		stats, ctx = querier_stats.ContextWithEmptyStats(r.Context())
   113  		r = r.WithContext(ctx)
   114  	}
   115  
   116  	defer func() {
   117  		_ = r.Body.Close()
   118  	}()
   119  
   120  	// Buffer the body for later use to track slow queries.
   121  	var buf bytes.Buffer
   122  	r.Body = http.MaxBytesReader(w, r.Body, f.cfg.MaxBodySize)
   123  	r.Body = ioutil.NopCloser(io.TeeReader(r.Body, &buf))
   124  
   125  	startTime := time.Now()
   126  	resp, err := f.roundTripper.RoundTrip(r)
   127  	queryResponseTime := time.Since(startTime)
   128  
   129  	if err != nil {
   130  		writeError(w, err)
   131  		return
   132  	}
   133  
   134  	hs := w.Header()
   135  	for h, vs := range resp.Header {
   136  		hs[h] = vs
   137  	}
   138  
   139  	if f.cfg.QueryStatsEnabled {
   140  		writeServiceTimingHeader(queryResponseTime, hs, stats)
   141  	}
   142  
   143  	w.WriteHeader(resp.StatusCode)
   144  	// log copy response body error so that we will know even though success response code returned
   145  	bytesCopied, err := io.Copy(w, resp.Body)
   146  	if err != nil && !errors.Is(err, syscall.EPIPE) {
   147  		level.Error(util_log.WithContext(r.Context(), f.log)).Log("msg", "write response body error", "bytesCopied", bytesCopied, "err", err)
   148  	}
   149  
   150  	// Check whether we should parse the query string.
   151  	shouldReportSlowQuery := f.cfg.LogQueriesLongerThan != 0 && queryResponseTime > f.cfg.LogQueriesLongerThan
   152  	if shouldReportSlowQuery || f.cfg.QueryStatsEnabled {
   153  		queryString = f.parseRequestQueryString(r, buf)
   154  	}
   155  
   156  	if shouldReportSlowQuery {
   157  		f.reportSlowQuery(r, hs, queryString, queryResponseTime)
   158  	}
   159  	if f.cfg.QueryStatsEnabled {
   160  		f.reportQueryStats(r, queryString, queryResponseTime, stats)
   161  	}
   162  }
   163  
   164  // reportSlowQuery reports slow queries.
   165  func (f *Handler) reportSlowQuery(r *http.Request, responseHeaders http.Header, queryString url.Values, queryResponseTime time.Duration) {
   166  	// NOTE(GiedriusS): see https://github.com/grafana/grafana/pull/60301 for more info.
   167  	grafanaDashboardUID := "-"
   168  	if dashboardUID := r.Header.Get("X-Dashboard-Uid"); dashboardUID != "" {
   169  		grafanaDashboardUID = dashboardUID
   170  	}
   171  	grafanaPanelID := "-"
   172  	if panelID := r.Header.Get("X-Panel-Id"); panelID != "" {
   173  		grafanaPanelID = panelID
   174  	}
   175  	thanosTraceID := "-"
   176  	if traceID := responseHeaders.Get("X-Thanos-Trace-Id"); traceID != "" {
   177  		thanosTraceID = traceID
   178  	}
   179  
   180  	remoteUser, _, _ := r.BasicAuth()
   181  
   182  	logMessage := append([]interface{}{
   183  		"msg", "slow query detected",
   184  		"method", r.Method,
   185  		"host", r.Host,
   186  		"path", r.URL.Path,
   187  		"remote_user", remoteUser,
   188  		"remote_addr", r.RemoteAddr,
   189  		"time_taken", queryResponseTime.String(),
   190  		"grafana_dashboard_uid", grafanaDashboardUID,
   191  		"grafana_panel_id", grafanaPanelID,
   192  		"trace_id", thanosTraceID,
   193  	}, formatQueryString(queryString)...)
   194  
   195  	level.Info(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
   196  }
   197  
   198  func (f *Handler) reportQueryStats(r *http.Request, queryString url.Values, queryResponseTime time.Duration, stats *querier_stats.Stats) {
   199  	tenantIDs, err := tenant.TenantIDs(r.Context())
   200  	if err != nil {
   201  		return
   202  	}
   203  	userID := tenant.JoinTenantIDs(tenantIDs)
   204  	wallTime := stats.LoadWallTime()
   205  	numSeries := stats.LoadFetchedSeries()
   206  	numBytes := stats.LoadFetchedChunkBytes()
   207  	remoteUser, _, _ := r.BasicAuth()
   208  
   209  	// Track stats.
   210  	f.querySeconds.WithLabelValues(userID).Add(wallTime.Seconds())
   211  	f.querySeries.WithLabelValues(userID).Add(float64(numSeries))
   212  	f.queryBytes.WithLabelValues(userID).Add(float64(numBytes))
   213  	f.activeUsers.UpdateUserTimestamp(userID, time.Now())
   214  
   215  	// Log stats.
   216  	logMessage := append([]interface{}{
   217  		"msg", "query stats",
   218  		"component", "query-frontend",
   219  		"method", r.Method,
   220  		"path", r.URL.Path,
   221  		"remote_user", remoteUser,
   222  		"remote_addr", r.RemoteAddr,
   223  		"response_time", queryResponseTime,
   224  		"query_wall_time_seconds", wallTime.Seconds(),
   225  		"fetched_series_count", numSeries,
   226  		"fetched_chunks_bytes", numBytes,
   227  	}, formatQueryString(queryString)...)
   228  
   229  	level.Info(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
   230  }
   231  
   232  func (f *Handler) parseRequestQueryString(r *http.Request, bodyBuf bytes.Buffer) url.Values {
   233  	// Use previously buffered body.
   234  	r.Body = ioutil.NopCloser(&bodyBuf)
   235  
   236  	// Ensure the form has been parsed so all the parameters are present
   237  	err := r.ParseForm()
   238  	if err != nil {
   239  		level.Warn(util_log.WithContext(r.Context(), f.log)).Log("msg", "unable to parse request form", "err", err)
   240  		return nil
   241  	}
   242  
   243  	return r.Form
   244  }
   245  
   246  func formatQueryString(queryString url.Values) (fields []interface{}) {
   247  	for k, v := range queryString {
   248  		fields = append(fields, fmt.Sprintf("param_%s", k), strings.Join(v, ","))
   249  	}
   250  	return fields
   251  }
   252  
   253  func writeError(w http.ResponseWriter, err error) {
   254  	switch err {
   255  	case context.Canceled:
   256  		err = errCanceled
   257  	case context.DeadlineExceeded:
   258  		err = errDeadlineExceeded
   259  	default:
   260  		if util.IsRequestBodyTooLarge(err) {
   261  			err = errRequestEntityTooLarge
   262  		}
   263  	}
   264  	server.WriteError(w, err)
   265  }
   266  
   267  func writeServiceTimingHeader(queryResponseTime time.Duration, headers http.Header, stats *querier_stats.Stats) {
   268  	if stats != nil {
   269  		parts := make([]string, 0)
   270  		parts = append(parts, statsValue("querier_wall_time", stats.LoadWallTime()))
   271  		parts = append(parts, statsValue("response_time", queryResponseTime))
   272  		headers.Set(ServiceTimingHeaderName, strings.Join(parts, ", "))
   273  	}
   274  }
   275  
   276  func statsValue(name string, d time.Duration) string {
   277  	durationInMs := strconv.FormatFloat(float64(d)/float64(time.Millisecond), 'f', -1, 64)
   278  	return name + ";dur=" + durationInMs
   279  }