github.com/thanos-io/thanos@v0.32.5/pkg/receive/handler.go

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package receive
     5  
     6  import (
     7  	"bytes"
     8  	"context"
     9  	"crypto/tls"
    10  	"fmt"
    11  	"io"
    12  	stdlog "log"
    13  	"math"
    14  	"net"
    15  	"net/http"
    16  	"sort"
    17  	"strconv"
    18  	"sync"
    19  	"time"
    20  
    21  	"github.com/go-kit/log"
    22  	"github.com/go-kit/log/level"
    23  	"github.com/gogo/protobuf/proto"
    24  	"github.com/jpillora/backoff"
    25  	"github.com/klauspost/compress/s2"
    26  	"github.com/mwitkow/go-conntrack"
    27  	"github.com/opentracing/opentracing-go"
    28  	"github.com/pkg/errors"
    29  	"github.com/prometheus/client_golang/prometheus"
    30  	"github.com/prometheus/client_golang/prometheus/promauto"
    31  	"github.com/prometheus/common/route"
    32  	"github.com/prometheus/prometheus/model/relabel"
    33  	"github.com/prometheus/prometheus/storage"
    34  	"github.com/prometheus/prometheus/tsdb"
    35  	"google.golang.org/grpc"
    36  	"google.golang.org/grpc/codes"
    37  	"google.golang.org/grpc/status"
    38  
    39  	"github.com/thanos-io/thanos/pkg/api"
    40  	statusapi "github.com/thanos-io/thanos/pkg/api/status"
    41  	"github.com/thanos-io/thanos/pkg/logging"
    42  
    43  	extpromhttp "github.com/thanos-io/thanos/pkg/extprom/http"
    44  	"github.com/thanos-io/thanos/pkg/runutil"
    45  	"github.com/thanos-io/thanos/pkg/server/http/middleware"
    46  	"github.com/thanos-io/thanos/pkg/store/labelpb"
    47  	"github.com/thanos-io/thanos/pkg/store/storepb"
    48  	"github.com/thanos-io/thanos/pkg/store/storepb/prompb"
    49  	"github.com/thanos-io/thanos/pkg/tenancy"
    50  	"github.com/thanos-io/thanos/pkg/tracing"
    51  )
    52  
    53  const (
    54  	// DefaultStatsLimit is the default value used for limiting tenant stats.
    55  	DefaultStatsLimit = 10
    56  	// DefaultReplicaHeader is the default header used to designate the replica count of a write request.
    57  	DefaultReplicaHeader = "THANOS-REPLICA"
    58  	// AllTenantsQueryParam is the query parameter for getting TSDB stats for all tenants.
    59  	AllTenantsQueryParam = "all_tenants"
    60  	// LimitStatsQueryParam is the query parameter for limiting the amount of returned TSDB stats.
    61  	LimitStatsQueryParam = "limit"
    62  	// Labels for metrics.
    63  	labelSuccess = "success"
    64  	labelError   = "error"
    65  )
    66  
    67  var (
    68  	// errConflict is returned whenever an operation fails due to any conflict-type error.
    69  	errConflict = errors.New("conflict")
    70  
    71  	errBadReplica  = errors.New("request replica exceeds receiver replication factor")
    72  	errNotReady    = errors.New("target not ready")
    73  	errUnavailable = errors.New("target not available")
    74  	errInternal    = errors.New("internal error")
    75  )
    76  
    77  // Options for the web Handler.
    78  type Options struct {
    79  	Writer            *Writer
    80  	ListenAddress     string
    81  	Registry          *prometheus.Registry
    82  	TenantHeader      string
    83  	TenantField       string
    84  	DefaultTenantID   string
    85  	ReplicaHeader     string
    86  	Endpoint          string
    87  	ReplicationFactor uint64
    88  	ReceiverMode      ReceiverMode
    89  	Tracer            opentracing.Tracer
    90  	TLSConfig         *tls.Config
    91  	DialOpts          []grpc.DialOption
    92  	ForwardTimeout    time.Duration
    93  	MaxBackoff        time.Duration
    94  	RelabelConfigs    []*relabel.Config
    95  	TSDBStats         TSDBStats
    96  	Limiter           *Limiter
    97  }
    98  
    99  // Handler serves a Prometheus remote write receiving HTTP endpoint.
   100  type Handler struct {
   101  	logger   log.Logger
   102  	writer   *Writer
   103  	router   *route.Router
   104  	options  *Options
   105  	listener net.Listener
   106  
   107  	mtx          sync.RWMutex
   108  	hashring     Hashring
   109  	peers        *peerGroup
   110  	expBackoff   backoff.Backoff
   111  	peerStates   map[string]*retryState
   112  	receiverMode ReceiverMode
   113  
   114  	forwardRequests   *prometheus.CounterVec
   115  	replications      *prometheus.CounterVec
   116  	replicationFactor prometheus.Gauge
   117  
   118  	writeSamplesTotal    *prometheus.HistogramVec
   119  	writeTimeseriesTotal *prometheus.HistogramVec
   120  
   121  	Limiter *Limiter
   122  }
   123  
   124  func NewHandler(logger log.Logger, o *Options) *Handler {
   125  	if logger == nil {
   126  		logger = log.NewNopLogger()
   127  	}
   128  
   129  	var registerer prometheus.Registerer = nil
   130  	if o.Registry != nil {
   131  		registerer = o.Registry
   132  	}
   133  
   134  	h := &Handler{
   135  		logger:       logger,
   136  		writer:       o.Writer,
   137  		router:       route.New(),
   138  		options:      o,
   139  		peers:        newPeerGroup(o.DialOpts...),
   140  		receiverMode: o.ReceiverMode,
   141  		expBackoff: backoff.Backoff{
   142  			Factor: 2,
   143  			Min:    100 * time.Millisecond,
   144  			Max:    o.MaxBackoff,
   145  			Jitter: true,
   146  		},
   147  		Limiter: o.Limiter,
   148  		forwardRequests: promauto.With(registerer).NewCounterVec(
   149  			prometheus.CounterOpts{
   150  				Name: "thanos_receive_forward_requests_total",
   151  				Help: "The number of forward requests.",
   152  			}, []string{"result"},
   153  		),
   154  		replications: promauto.With(registerer).NewCounterVec(
   155  			prometheus.CounterOpts{
   156  				Name: "thanos_receive_replications_total",
   157  				Help: "The number of replication operations done by the receiver. The success of replication is fulfilled when a quorum is met.",
   158  			}, []string{"result"},
   159  		),
   160  		replicationFactor: promauto.With(registerer).NewGauge(
   161  			prometheus.GaugeOpts{
   162  				Name: "thanos_receive_replication_factor",
   163  				Help: "The number of times to replicate incoming write requests.",
   164  			},
   165  		),
   166  		writeTimeseriesTotal: promauto.With(registerer).NewHistogramVec(
   167  			prometheus.HistogramOpts{
   168  				Namespace: "thanos",
   169  				Subsystem: "receive",
   170  				Name:      "write_timeseries",
   171  				Help:      "The number of timeseries received in the incoming write requests.",
   172  				Buckets:   []float64{10, 50, 100, 500, 1000, 5000, 10000},
   173  			}, []string{"code", "tenant"},
   174  		),
   175  		writeSamplesTotal: promauto.With(registerer).NewHistogramVec(
   176  			prometheus.HistogramOpts{
   177  				Namespace: "thanos",
   178  				Subsystem: "receive",
   179  				Name:      "write_samples",
   180  				Help:      "The number of samples received in the incoming write requests.",
   181  				Buckets:   []float64{10, 50, 100, 500, 1000, 5000, 10000},
   182  			}, []string{"code", "tenant"},
   183  		),
   184  	}
   185  
   186  	h.forwardRequests.WithLabelValues(labelSuccess)
   187  	h.forwardRequests.WithLabelValues(labelError)
   188  	h.replications.WithLabelValues(labelSuccess)
   189  	h.replications.WithLabelValues(labelError)
   190  
   191  	if o.ReplicationFactor > 1 {
   192  		h.replicationFactor.Set(float64(o.ReplicationFactor))
   193  	} else {
   194  		h.replicationFactor.Set(1)
   195  	}
   196  
   197  	ins := extpromhttp.NewNopInstrumentationMiddleware()
   198  	if o.Registry != nil {
   199  		ins = extpromhttp.NewTenantInstrumentationMiddleware(
   200  			o.TenantHeader,
   201  			o.Registry,
   202  			[]float64{0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.25, 0.5, 0.75, 1, 2, 3, 4, 5},
   203  		)
   204  	}
   205  
   206  	readyf := h.testReady
   207  	instrf := func(name string, next func(w http.ResponseWriter, r *http.Request)) http.HandlerFunc {
   208  		next = ins.NewHandler(name, http.HandlerFunc(next))
   209  
   210  		if o.Tracer != nil {
   211  			next = tracing.HTTPMiddleware(o.Tracer, name, logger, http.HandlerFunc(next))
   212  		}
   213  		return next
   214  	}
   215  
   216  	h.router.Post(
   217  		"/api/v1/receive",
   218  		instrf(
   219  			"receive",
   220  			readyf(
   221  				middleware.RequestID(
   222  					http.HandlerFunc(h.receiveHTTP),
   223  				),
   224  			),
   225  		),
   226  	)
   227  
   228  	statusAPI := statusapi.New(statusapi.Options{
   229  		GetStats: h.getStats,
   230  		Registry: h.options.Registry,
   231  	})
   232  	statusAPI.Register(h.router, o.Tracer, logger, ins, logging.NewHTTPServerMiddleware(logger))
   233  
   234  	return h
   235  }
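
        // A hedged construction sketch, not part of the original file: roughly how the options above
        // could be wired into a Handler. The concrete values, the RouterIngestor mode constant and the
        // tenancy.DefaultTenantHeader/DefaultTenant constants are assumptions for illustration; real
        // wiring lives in the receive command.
        func exampleNewHandler(logger log.Logger, reg *prometheus.Registry, writer *Writer, limiter *Limiter) *Handler {
        	h := NewHandler(logger, &Options{
        		Writer:            writer,
        		ListenAddress:     "0.0.0.0:19291",
        		Registry:          reg,
        		TenantHeader:      tenancy.DefaultTenantHeader,
        		DefaultTenantID:   tenancy.DefaultTenant,
        		ReplicaHeader:     DefaultReplicaHeader,
        		ReplicationFactor: 3,
        		ReceiverMode:      RouterIngestor,
        		ForwardTimeout:    30 * time.Second,
        		MaxBackoff:        5 * time.Second,
        		Limiter:           limiter,
        	})
        	// The handler only reports ready once a hashring has been set via Hashring below.
        	return h
        }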
   236  
   237  // Hashring sets the hashring for the handler and marks the hashring as ready.
   238  // The hashring must be set to a non-nil value in order for the
   239  // handler to be ready and usable.
   240  // If the hashring is nil, then the handler is marked as not ready.
   241  func (h *Handler) Hashring(hashring Hashring) {
   242  	h.mtx.Lock()
   243  	defer h.mtx.Unlock()
   244  
   245  	h.hashring = hashring
   246  	h.expBackoff.Reset()
   247  	h.peerStates = make(map[string]*retryState)
   248  }
   249  
   250  // Verifies whether the server is ready or not.
   251  func (h *Handler) isReady() bool {
   252  	h.mtx.RLock()
   253  	hr := h.hashring != nil
   254  	sr := h.writer != nil
   255  	h.mtx.RUnlock()
   256  	return sr && hr
   257  }
   258  
   259  // Checks if server is ready, calls f if it is, returns 503 if it is not.
   260  func (h *Handler) testReady(f http.HandlerFunc) http.HandlerFunc {
   261  	return func(w http.ResponseWriter, r *http.Request) {
   262  		if h.isReady() {
   263  			f(w, r)
   264  			return
   265  		}
   266  
   267  		w.WriteHeader(http.StatusServiceUnavailable)
   268  		_, err := fmt.Fprintf(w, "Service Unavailable")
   269  		if err != nil {
   270  			h.logger.Log("msg", "failed to write to response body", "err", err)
   271  		}
   272  	}
   273  }
   274  
   275  func getStatsLimitParameter(r *http.Request) (int, error) {
   276  	statsLimitStr := r.URL.Query().Get(LimitStatsQueryParam)
   277  	if statsLimitStr == "" {
   278  		return DefaultStatsLimit, nil
   279  	}
   280  	statsLimit, err := strconv.ParseInt(statsLimitStr, 10, 0)
   281  	if err != nil {
   282  		return 0, fmt.Errorf("unable to parse '%s' parameter: %w", LimitStatsQueryParam, err)
   283  	}
   284  	if statsLimit > math.MaxInt {
   285  		return 0, fmt.Errorf("'%s' parameter is larger than %d", LimitStatsQueryParam, math.MaxInt)
   286  	}
   287  	return int(statsLimit), nil
   288  }
   289  
   290  func (h *Handler) getStats(r *http.Request, statsByLabelName string) ([]statusapi.TenantStats, *api.ApiError) {
   291  	if !h.isReady() {
   292  		return nil, &api.ApiError{Typ: api.ErrorInternal, Err: fmt.Errorf("service unavailable")}
   293  	}
   294  
   295  	tenantID := r.Header.Get(h.options.TenantHeader)
   296  	getAllTenantStats := r.FormValue(AllTenantsQueryParam) == "true"
   297  	if getAllTenantStats && tenantID != "" {
   298  		err := fmt.Errorf("using both the %s parameter and the %s header is not supported", AllTenantsQueryParam, h.options.TenantHeader)
   299  		return nil, &api.ApiError{Typ: api.ErrorBadData, Err: err}
   300  	}
   301  
   302  	statsLimit, err := getStatsLimitParameter(r)
   303  	if err != nil {
   304  		return nil, &api.ApiError{Typ: api.ErrorBadData, Err: err}
   305  	}
   306  
   307  	if getAllTenantStats {
   308  		return h.options.TSDBStats.TenantStats(statsLimit, statsByLabelName), nil
   309  	}
   310  
   311  	if tenantID == "" {
   312  		tenantID = h.options.DefaultTenantID
   313  	}
   314  
   315  	return h.options.TSDBStats.TenantStats(statsLimit, statsByLabelName, tenantID), nil
   316  }
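
        // A hedged request sketch, not part of the original file: how a client might call the stats
        // endpoint that getStats backs. The /api/v1/status/tsdb path and the "example-tenant" value
        // are assumptions for illustration only.
        func exampleStatsRequest(listenAddress, tenantHeader string) (*http.Request, error) {
        	req, err := http.NewRequest(http.MethodGet,
        		"http://"+listenAddress+"/api/v1/status/tsdb?"+LimitStatsQueryParam+"=5", nil)
        	if err != nil {
        		return nil, err
        	}
        	// Either scope the stats to one tenant via the tenant header...
        	req.Header.Set(tenantHeader, "example-tenant")
        	// ...or request every tenant instead; combining both is rejected by getStats:
        	// req.Header.Del(tenantHeader)
        	// req.URL.RawQuery = AllTenantsQueryParam + "=true&" + LimitStatsQueryParam + "=5"
        	return req, nil
        }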
   317  
   318  // Close stops the Handler.
   319  func (h *Handler) Close() {
   320  	if h.listener != nil {
   321  		runutil.CloseWithLogOnErr(h.logger, h.listener, "receive HTTP listener")
   322  	}
   323  }
   324  
   325  // Run serves the HTTP endpoints.
   326  func (h *Handler) Run() error {
   327  	level.Info(h.logger).Log("msg", "Start listening for connections", "address", h.options.ListenAddress)
   328  
   329  	var err error
   330  	h.listener, err = net.Listen("tcp", h.options.ListenAddress)
   331  	if err != nil {
   332  		return err
   333  	}
   334  
   335  	// Monitor incoming connections with conntrack.
   336  	h.listener = conntrack.NewListener(h.listener,
   337  		conntrack.TrackWithName("http"),
   338  		conntrack.TrackWithTracing())
   339  
   340  	errlog := stdlog.New(log.NewStdlibAdapter(level.Error(h.logger)), "", 0)
   341  
   342  	httpSrv := &http.Server{
   343  		Handler:   h.router,
   344  		ErrorLog:  errlog,
   345  		TLSConfig: h.options.TLSConfig,
   346  	}
   347  
   348  	if h.options.TLSConfig != nil {
   349  		level.Info(h.logger).Log("msg", "Serving HTTPS", "address", h.options.ListenAddress)
   350  		// Cert & Key are already being passed in via TLSConfig.
   351  		return httpSrv.ServeTLS(h.listener, "", "")
   352  	}
   353  
   354  	level.Info(h.logger).Log("msg", "Serving plain HTTP", "address", h.options.ListenAddress)
   355  	return httpSrv.Serve(h.listener)
   356  }
   357  
   358  // replica encapsulates the replica number of a request and if the request is
   359  // already replicated.
   360  type replica struct {
   361  	n          uint64
   362  	replicated bool
   363  }
   364  
   365  // endpointReplica is a pair of a receive endpoint and a write request replica.
   366  type endpointReplica struct {
   367  	endpoint string
   368  	replica  uint64
   369  }
   370  
   371  type trackedSeries struct {
   372  	seriesIDs  []int
   373  	timeSeries []prompb.TimeSeries
   374  }
   375  
   376  type writeResponse struct {
   377  	seriesIDs []int
   378  	err       error
   379  }
   380  
   381  func newWriteResponse(seriesIDs []int, err error) writeResponse {
   382  	return writeResponse{
   383  		seriesIDs: seriesIDs,
   384  		err:       err,
   385  	}
   386  }
   387  
   388  func (h *Handler) handleRequest(ctx context.Context, rep uint64, tenant string, wreq *prompb.WriteRequest) error {
   389  	tLogger := log.With(h.logger, "tenant", tenant)
   390  
   391  	// This replica value is used to detect cycles in cyclic topologies.
   392  	// A non-zero value indicates that the request has already been replicated by a previous receive instance.
   393  	// For almost all users, this is only used in fully connected topologies of IngestorRouter instances.
   394  	// For acyclic topologies that use RouterOnly and IngestorOnly instances, this causes issues when replicating data.
   395  	// See discussion in: https://github.com/thanos-io/thanos/issues/4359.
   396  	if h.receiverMode == RouterOnly || h.receiverMode == IngestorOnly {
   397  		rep = 0
   398  	}
   399  
   400  	// The replica value in the header is one-indexed, thus we need >.
   401  	if rep > h.options.ReplicationFactor {
   402  		level.Error(tLogger).Log("err", errBadReplica, "msg", "write request rejected",
   403  			"request_replica", rep, "replication_factor", h.options.ReplicationFactor)
   404  		return errBadReplica
   405  	}
   406  
   407  	r := replica{n: rep, replicated: rep != 0}
   408  
   409  	// On the wire, format is 1-indexed and in-code is 0-indexed, so we decrement the value if it was already replicated.
   410  	if r.replicated {
   411  		r.n--
   412  	}
   413  
   414  	// Forward any time series as necessary. All time series
   415  	// destined for the local node will be written to the receiver.
   416  	// Time series will be replicated as necessary.
   417  	return h.forward(ctx, tenant, r, wreq)
   418  }
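
        // For illustration (not in the original file), assuming a RouterIngestor node with
        // ReplicationFactor = 3, the replica header values map to the following behaviour:
        //
        //	no replica header -> rep = 0 -> replica{n: 0, replicated: false}: fan out to replicas 0, 1 and 2.
        //	replica header 2  -> rep = 2 -> replica{n: 1, replicated: true}:  write only 0-indexed replica 1.
        //	replica header 4  -> rep = 4 -> rejected with errBadReplica, since 4 > 3.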
   419  
   420  func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
   421  	var err error
   422  	span, ctx := tracing.StartSpan(r.Context(), "receive_http")
   423  	defer span.Finish()
   424  
   425  	tenant, err := tenancy.GetTenantFromHTTP(r, h.options.TenantHeader, h.options.DefaultTenantID, h.options.TenantField)
   426  	if err != nil {
   427  		level.Error(h.logger).Log("msg", "error getting tenant from HTTP", "err", err)
   428  		http.Error(w, err.Error(), http.StatusBadRequest)
   429  		return
   430  	}
   431  
   432  	tLogger := log.With(h.logger, "tenant", tenant)
   433  
   434  	writeGate := h.Limiter.WriteGate()
   435  	tracing.DoInSpan(r.Context(), "receive_write_gate_ismyturn", func(ctx context.Context) {
   436  		err = writeGate.Start(r.Context())
   437  	})
   438  	defer writeGate.Done()
   439  	if err != nil {
   440  		level.Error(tLogger).Log("err", err, "msg", "internal server error")
   441  		http.Error(w, err.Error(), http.StatusInternalServerError)
   442  		return
   443  	}
   444  
   445  	under, err := h.Limiter.HeadSeriesLimiter.isUnderLimit(tenant)
   446  	if err != nil {
   447  		level.Error(tLogger).Log("msg", "error while limiting", "err", err.Error())
   448  	}
   449  
   450  	// Fail request fully if tenant has exceeded set limit.
   451  	if !under {
   452  		http.Error(w, "tenant is above active series limit", http.StatusTooManyRequests)
   453  		return
   454  	}
   455  
   456  	requestLimiter := h.Limiter.RequestLimiter()
   457  	// io.ReadAll dynamically adjusts the byte slice for read data, starting from 512B.
   458  	// Since this is the receive hot path, grow the buffer upfront to save allocations and CPU time.
   459  	compressed := bytes.Buffer{}
   460  	if r.ContentLength >= 0 {
   461  		if !requestLimiter.AllowSizeBytes(tenant, r.ContentLength) {
   462  			http.Error(w, "write request too large", http.StatusRequestEntityTooLarge)
   463  			return
   464  		}
   465  		compressed.Grow(int(r.ContentLength))
   466  	} else {
   467  		compressed.Grow(512)
   468  	}
   469  	_, err = io.Copy(&compressed, r.Body)
   470  	if err != nil {
   471  		http.Error(w, errors.Wrap(err, "read compressed request body").Error(), http.StatusInternalServerError)
   472  		return
   473  	}
   474  	reqBuf, err := s2.Decode(nil, compressed.Bytes())
   475  	if err != nil {
   476  		level.Error(tLogger).Log("msg", "snappy decode error", "err", err)
   477  		http.Error(w, errors.Wrap(err, "snappy decode error").Error(), http.StatusBadRequest)
   478  		return
   479  	}
   480  
   481  	if !requestLimiter.AllowSizeBytes(tenant, int64(len(reqBuf))) {
   482  		http.Error(w, "write request too large", http.StatusRequestEntityTooLarge)
   483  		return
   484  	}
   485  
   486  	// NOTE: Due to zero-copy ZLabels, Labels taken from WriteRequests keep memory
   487  	// from the whole request alive. Ensure that we always copy them when we want to
   488  	// store them for a longer time.
   489  	var wreq prompb.WriteRequest
   490  	if err := proto.Unmarshal(reqBuf, &wreq); err != nil {
   491  		http.Error(w, err.Error(), http.StatusBadRequest)
   492  		return
   493  	}
   494  
   495  	rep := uint64(0)
   496  	// If the header is empty, we assume the request is not yet replicated.
   497  	if replicaRaw := r.Header.Get(h.options.ReplicaHeader); replicaRaw != "" {
   498  		if rep, err = strconv.ParseUint(replicaRaw, 10, 64); err != nil {
   499  			http.Error(w, "could not parse replica header", http.StatusBadRequest)
   500  			return
   501  		}
   502  	}
   503  
   504  	// Exit early if the request contained no data. We don't support metadata yet. We also cannot fail here, because
   505  	// that would break forward compatibility with the remote write proto.
   506  	if len(wreq.Timeseries) == 0 {
   507  		// TODO(yeya24): Handle remote write metadata.
   508  		if len(wreq.Metadata) > 0 {
   509  			// TODO(bwplotka): Do we need this error message?
   510  			level.Debug(tLogger).Log("msg", "only metadata from client; metadata ingestion not supported; skipping")
   511  			return
   512  		}
   513  		level.Debug(tLogger).Log("msg", "empty remote write request; client bug or newer remote write protocol used?; skipping")
   514  		return
   515  	}
   516  
   517  	if !requestLimiter.AllowSeries(tenant, int64(len(wreq.Timeseries))) {
   518  		http.Error(w, "too many timeseries", http.StatusRequestEntityTooLarge)
   519  		return
   520  	}
   521  
   522  	totalSamples := 0
   523  	for _, timeseries := range wreq.Timeseries {
   524  		totalSamples += len(timeseries.Samples)
   525  	}
   526  	if !requestLimiter.AllowSamples(tenant, int64(totalSamples)) {
   527  		http.Error(w, "too many samples", http.StatusRequestEntityTooLarge)
   528  		return
   529  	}
   530  
   531  	// Apply relabeling configs.
   532  	h.relabel(&wreq)
   533  	if len(wreq.Timeseries) == 0 {
   534  		level.Debug(tLogger).Log("msg", "remote write request dropped due to relabeling.")
   535  		return
   536  	}
   537  
   538  	responseStatusCode := http.StatusOK
   539  	if err = h.handleRequest(ctx, rep, tenant, &wreq); err != nil {
   540  		level.Debug(tLogger).Log("msg", "failed to handle request", "err", err)
   541  		switch errors.Cause(err) {
   542  		case errNotReady:
   543  			responseStatusCode = http.StatusServiceUnavailable
   544  		case errUnavailable:
   545  			responseStatusCode = http.StatusServiceUnavailable
   546  		case errConflict:
   547  			responseStatusCode = http.StatusConflict
   548  		case errBadReplica:
   549  			responseStatusCode = http.StatusBadRequest
   550  		default:
   551  			level.Error(tLogger).Log("err", err, "msg", "internal server error")
   552  			responseStatusCode = http.StatusInternalServerError
   553  		}
   554  		http.Error(w, err.Error(), responseStatusCode)
   555  	}
   556  	h.writeTimeseriesTotal.WithLabelValues(strconv.Itoa(responseStatusCode), tenant).Observe(float64(len(wreq.Timeseries)))
   557  	h.writeSamplesTotal.WithLabelValues(strconv.Itoa(responseStatusCode), tenant).Observe(float64(totalSamples))
   558  }
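
        // A hedged client sketch, not part of the original file: building a minimal request that
        // receiveHTTP would accept. It assumes tenancy.DefaultTenantHeader as the configured tenant
        // header and that s2.EncodeSnappy emits the Snappy block format which s2.Decode above reads,
        // mirroring what Prometheus remote write sends.
        func exampleRemoteWriteRequest(url, tenant string) (*http.Request, error) {
        	wreq := &prompb.WriteRequest{
        		Timeseries: []prompb.TimeSeries{{
        			Labels:  []labelpb.ZLabel{{Name: "__name__", Value: "example_metric"}},
        			Samples: []prompb.Sample{{Value: 1, Timestamp: time.Now().UnixMilli()}},
        		}},
        	}
        	raw, err := proto.Marshal(wreq)
        	if err != nil {
        		return nil, err
        	}
        	req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(s2.EncodeSnappy(nil, raw)))
        	if err != nil {
        		return nil, err
        	}
        	req.Header.Set(tenancy.DefaultTenantHeader, tenant)
        	// Leaving the replica header unset marks the request as not yet replicated.
        	return req, nil
        }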
   559  
   560  // forward accepts a write request, batches its time series by
   561  // corresponding endpoint, and forwards them in parallel to the
   562  // correct endpoint. Requests destined for the local node are written
   563  // to the local receiver. For a given write request, at most one outgoing
   564  // write request will be made to every other node in the hashring,
   565  // unless the request needs to be replicated.
   566  // The function returns once the write quorum has been reached, all requests
   567  // have finished, or the forward timeout expires.
   568  func (h *Handler) forward(ctx context.Context, tenant string, r replica, wreq *prompb.WriteRequest) error {
   569  	span, ctx := tracing.StartSpan(ctx, "receive_fanout_forward")
   570  	defer span.Finish()
   571  
   572  	// It is possible that the hashring was ready in testReady() but has become
   573  	// unready since, so we need to take the lock here.
   574  	h.mtx.RLock()
   575  	if h.hashring == nil {
   576  		h.mtx.RUnlock()
   577  		return errors.New("hashring is not ready")
   578  	}
   579  
   580  	var replicas []uint64
   581  	if r.replicated {
   582  		replicas = []uint64{r.n}
   583  	} else {
   584  		for rn := uint64(0); rn < h.options.ReplicationFactor; rn++ {
   585  			replicas = append(replicas, rn)
   586  		}
   587  	}
   588  
   589  	wreqs := make(map[endpointReplica]trackedSeries)
   590  	for tsID, ts := range wreq.Timeseries {
   591  		for _, rn := range replicas {
   592  			endpoint, err := h.hashring.GetN(tenant, &ts, rn)
   593  			if err != nil {
   594  				h.mtx.RUnlock()
   595  				return err
   596  			}
   597  			key := endpointReplica{endpoint: endpoint, replica: rn}
   598  			writeTarget, ok := wreqs[key]
   599  			if !ok {
   600  				writeTarget = trackedSeries{
   601  					seriesIDs:  make([]int, 0),
   602  					timeSeries: make([]prompb.TimeSeries, 0),
   603  				}
   604  			}
   605  			writeTarget.timeSeries = append(wreqs[key].timeSeries, ts)
   606  			writeTarget.seriesIDs = append(wreqs[key].seriesIDs, tsID)
   607  			wreqs[key] = writeTarget
   608  		}
   609  	}
   610  	h.mtx.RUnlock()
   611  
   612  	return h.fanoutForward(ctx, tenant, wreqs, len(wreq.Timeseries), r.replicated)
   613  }
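
        // For illustration (not in the original file): with ReplicationFactor = 2, an un-replicated
        // request with two series, where series 0 hashes to node-a/node-b and series 1 hashes to
        // node-b/node-c, the wreqs map built above would contain:
        //
        //	{endpoint: "node-a", replica: 0} -> seriesIDs [0]
        //	{endpoint: "node-b", replica: 1} -> seriesIDs [0]
        //	{endpoint: "node-b", replica: 0} -> seriesIDs [1]
        //	{endpoint: "node-c", replica: 1} -> seriesIDs [1]
        //
        // A replicated request (r.replicated == true) instead targets only the single replica r.n.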
   614  
   615  // writeQuorum returns the minimum number of replicas that have to confirm a write's success before claiming replication success.
   616  func (h *Handler) writeQuorum() int {
   617  	return int((h.options.ReplicationFactor / 2) + 1)
   618  }
   619  
   620  func quorumReached(successes []int, successThreshold int) bool {
   621  	for _, success := range successes {
   622  		if success < successThreshold {
   623  			return false
   624  		}
   625  	}
   626  
   627  	return true
   628  }
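
        // A hedged sketch, not part of the original file: how writeQuorum and quorumReached combine.
        // With ReplicationFactor = 3 the quorum is 2, so every series needs at least two successful
        // replica writes before fanoutForward can return early.
        func exampleQuorum() {
        	h := &Handler{options: &Options{ReplicationFactor: 3}}
        	quorum := h.writeQuorum() // (3 / 2) + 1 == 2
        
        	successes := []int{2, 3, 1}                   // per-series success counts
        	fmt.Println(quorumReached(successes, quorum)) // false: the third series has only 1 success
        
        	successes[2]++
        	fmt.Println(quorumReached(successes, quorum)) // true
        }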
   629  
   630  // fanoutForward concurrently fans out the given set of write requests. It returns as soon as a quorum of
   631  // requests succeeds or fails, or when the forward timeout expires.
   632  func (h *Handler) fanoutForward(pctx context.Context, tenant string, wreqs map[endpointReplica]trackedSeries, numSeries int, seriesReplicated bool) error {
   633  	var errs writeErrors
   634  
   635  	fctx, cancel := context.WithTimeout(tracing.CopyTraceContext(context.Background(), pctx), h.options.ForwardTimeout)
   636  	defer func() {
   637  		if errs.ErrOrNil() != nil {
   638  		// NOTICE: The cancel function is intentionally not called on all paths:
   639  		// if there is no error once quorum is reached,
   640  		// the remaining forward requests are left to run optimistically until the timeout.
   641  			cancel()
   642  		}
   643  	}()
   644  
   645  	var tLogger log.Logger
   646  	{
   647  		logTags := []interface{}{"tenant", tenant}
   648  		if id, ok := middleware.RequestIDFromContext(pctx); ok {
   649  			logTags = append(logTags, "request-id", id)
   650  		}
   651  		tLogger = log.With(h.logger, logTags)
   652  	}
   653  
   654  	responses := make(chan writeResponse)
   655  
   656  	var wg sync.WaitGroup
   657  	for writeTarget := range wreqs {
   658  		wg.Add(1)
   659  
   660  		// If the endpoint for the write request is the
   661  		// local node, then don't make a request but store locally.
   662  		// By handing replication to the local node in the same
   663  		// function as replication to other nodes, we can treat
   664  		// a failure to write locally as just another error that
   665  		// can be ignored if the replication factor is met.
   666  		if writeTarget.endpoint == h.options.Endpoint {
   667  			go func(writeTarget endpointReplica) {
   668  				defer wg.Done()
   669  
   670  				var err error
   671  				tracing.DoInSpan(fctx, "receive_tsdb_write", func(_ context.Context) {
   672  					err = h.writer.Write(fctx, tenant, &prompb.WriteRequest{
   673  						Timeseries: wreqs[writeTarget].timeSeries,
   674  					})
   675  				})
   676  				if err != nil {
   677  					level.Debug(tLogger).Log("msg", "local tsdb write failed", "err", err.Error())
   678  					responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, errors.Wrapf(err, "store locally for endpoint %v", writeTarget.endpoint))
   679  					return
   680  				}
   681  				responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, nil)
   682  			}(writeTarget)
   683  
   684  			continue
   685  		}
   686  
   687  		// Make a request to the specified endpoint.
   688  		go func(writeTarget endpointReplica) {
   689  			defer wg.Done()
   690  
   691  			var (
   692  				err error
   693  				cl  storepb.WriteableStoreClient
   694  			)
   695  			defer func() {
   696  				// This is an actual remote forward request so report metric here.
   697  				if err != nil {
   698  					h.forwardRequests.WithLabelValues(labelError).Inc()
   699  					if !seriesReplicated {
   700  						h.replications.WithLabelValues(labelError).Inc()
   701  					}
   702  					return
   703  				}
   704  				h.forwardRequests.WithLabelValues(labelSuccess).Inc()
   705  				if !seriesReplicated {
   706  					h.replications.WithLabelValues(labelSuccess).Inc()
   707  				}
   708  			}()
   709  
   710  			cl, err = h.peers.get(fctx, writeTarget.endpoint)
   711  			if err != nil {
   712  				responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, errors.Wrapf(err, "get peer connection for endpoint %v", writeTarget.endpoint))
   713  				return
   714  			}
   715  
   716  			h.mtx.RLock()
   717  			b, ok := h.peerStates[writeTarget.endpoint]
   718  			if ok {
   719  				if time.Now().Before(b.nextAllowed) {
   720  					h.mtx.RUnlock()
   721  					responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, errors.Wrapf(errUnavailable, "backing off forward request for endpoint %v", writeTarget.endpoint))
   722  					return
   723  				}
   724  			}
   725  			h.mtx.RUnlock()
   726  
   727  			// Create a span to track the request made to another receive node.
   728  			tracing.DoInSpan(fctx, "receive_forward", func(ctx context.Context) {
   729  				// Actually make the request against the endpoint we determined should handle these time series.
   730  				_, err = cl.RemoteWrite(ctx, &storepb.WriteRequest{
   731  					Timeseries: wreqs[writeTarget].timeSeries,
   732  					Tenant:     tenant,
   733  					// Increment replica since on-the-wire format is 1-indexed and 0 indicates un-replicated.
   734  					Replica: int64(writeTarget.replica + 1),
   735  				})
   736  			})
   737  			if err != nil {
   738  				// If the peer connection is unavailable, back off instead of attempting to send requests constantly.
   739  				if st, ok := status.FromError(err); ok {
   740  					if st.Code() == codes.Unavailable {
   741  						h.mtx.Lock()
   742  						if b, ok := h.peerStates[writeTarget.endpoint]; ok {
   743  							b.attempt++
   744  							dur := h.expBackoff.ForAttempt(b.attempt)
   745  							b.nextAllowed = time.Now().Add(dur)
   746  							level.Debug(tLogger).Log("msg", "target unavailable backing off", "for", dur)
   747  						} else {
   748  							h.peerStates[writeTarget.endpoint] = &retryState{nextAllowed: time.Now().Add(h.expBackoff.ForAttempt(0))}
   749  						}
   750  						h.mtx.Unlock()
   751  					}
   752  				}
   753  				werr := errors.Wrapf(err, "forwarding request to endpoint %v", writeTarget.endpoint)
   754  				responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, werr)
   755  				return
   756  			}
   757  			h.mtx.Lock()
   758  			delete(h.peerStates, writeTarget.endpoint)
   759  			h.mtx.Unlock()
   760  
   761  			responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, nil)
   762  		}(writeTarget)
   763  	}
   764  
   765  	go func() {
   766  		wg.Wait()
   767  		close(responses)
   768  	}()
   769  
   770  	// At the end, make sure to exhaust the channel, letting the remaining unnecessary requests finish asynchronously.
   771  	// This is needed if the context is canceled or if we reached the success or failure quorum early.
   772  	defer func() {
   773  		go func() {
   774  			for wresp := range responses {
   775  				if wresp.err != nil {
   776  					level.Debug(tLogger).Log("msg", "request failed, but not needed to achieve quorum", "err", wresp.err)
   777  				}
   778  			}
   779  		}()
   780  	}()
   781  
   782  	quorum := h.writeQuorum()
   783  	if seriesReplicated {
   784  		quorum = 1
   785  	}
   786  	successes := make([]int, numSeries)
   787  	seriesErrs := newReplicationErrors(quorum, numSeries)
   788  	for {
   789  		select {
   790  		case <-fctx.Done():
   791  			return fctx.Err()
   792  		case wresp, more := <-responses:
   793  			if !more {
   794  				for _, rerr := range seriesErrs {
   795  					errs.Add(rerr)
   796  				}
   797  				return errs.ErrOrNil()
   798  			}
   799  
   800  			if wresp.err != nil {
   801  				for _, tsID := range wresp.seriesIDs {
   802  					seriesErrs[tsID].Add(wresp.err)
   803  				}
   804  				continue
   805  			}
   806  			for _, tsID := range wresp.seriesIDs {
   807  				successes[tsID]++
   808  			}
   809  			if quorumReached(successes, quorum) {
   810  				return nil
   811  			}
   812  		}
   813  	}
   814  }
   815  
   816  // RemoteWrite implements the gRPC remote write handler for storepb.WriteableStore.
   817  func (h *Handler) RemoteWrite(ctx context.Context, r *storepb.WriteRequest) (*storepb.WriteResponse, error) {
   818  	span, ctx := tracing.StartSpan(ctx, "receive_grpc")
   819  	defer span.Finish()
   820  
   821  	err := h.handleRequest(ctx, uint64(r.Replica), r.Tenant, &prompb.WriteRequest{Timeseries: r.Timeseries})
   822  	if err != nil {
   823  		level.Debug(h.logger).Log("msg", "failed to handle request", "err", err)
   824  	}
   825  	switch errors.Cause(err) {
   826  	case nil:
   827  		return &storepb.WriteResponse{}, nil
   828  	case errNotReady:
   829  		return nil, status.Error(codes.Unavailable, err.Error())
   830  	case errUnavailable:
   831  		return nil, status.Error(codes.Unavailable, err.Error())
   832  	case errConflict:
   833  		return nil, status.Error(codes.AlreadyExists, err.Error())
   834  	case errBadReplica:
   835  		return nil, status.Error(codes.InvalidArgument, err.Error())
   836  	default:
   837  		return nil, status.Error(codes.Internal, err.Error())
   838  	}
   839  }
   840  
   841  // relabel relabels the time series labels in the remote write request.
   842  func (h *Handler) relabel(wreq *prompb.WriteRequest) {
   843  	if len(h.options.RelabelConfigs) == 0 {
   844  		return
   845  	}
   846  	timeSeries := make([]prompb.TimeSeries, 0, len(wreq.Timeseries))
   847  	for _, ts := range wreq.Timeseries {
   848  		var keep bool
   849  		lbls, keep := relabel.Process(labelpb.ZLabelsToPromLabels(ts.Labels), h.options.RelabelConfigs...)
   850  		if !keep {
   851  			continue
   852  		}
   853  		ts.Labels = labelpb.ZLabelsFromPromLabels(lbls)
   854  		timeSeries = append(timeSeries, ts)
   855  	}
   856  	wreq.Timeseries = timeSeries
   857  }
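
        // A hedged sketch, not part of the original file: one way the relabel configs consumed above
        // could be built. The "replica" label name is a placeholder; in practice these configs come
        // from the receiver's relabel configuration flag rather than from code.
        func exampleRelabelConfig() []*relabel.Config {
        	return []*relabel.Config{
        		{
        			// Drop the external "replica" label from every incoming series
        			// before it is hashed and written.
        			Action: relabel.LabelDrop,
        			Regex:  relabel.MustNewRegexp("replica"),
        		},
        	}
        }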
   858  
   859  // isConflict returns whether or not the given error represents a conflict.
   860  func isConflict(err error) bool {
   861  	if err == nil {
   862  		return false
   863  	}
   864  	return err == errConflict ||
   865  		isSampleConflictErr(err) ||
   866  		isExemplarConflictErr(err) ||
   867  		isLabelsConflictErr(err) ||
   868  		status.Code(err) == codes.AlreadyExists
   869  }
   870  
   871  // isSampleConflictErr returns whether or not the given error represents
   872  // a sample-related conflict.
   873  func isSampleConflictErr(err error) bool {
   874  	return err == storage.ErrDuplicateSampleForTimestamp ||
   875  		err == storage.ErrOutOfOrderSample ||
   876  		err == storage.ErrOutOfBounds ||
   877  		err == storage.ErrTooOldSample
   878  }
   879  
   880  // isExemplarConflictErr returns whether or not the given error represents
   881  // an exemplar-related conflict.
   882  func isExemplarConflictErr(err error) bool {
   883  	return err == storage.ErrDuplicateExemplar ||
   884  		err == storage.ErrOutOfOrderExemplar ||
   885  		err == storage.ErrExemplarLabelLength
   886  }
   887  
   888  // isLabelsConflictErr returns whether or not the given error represents
   889  // a labels-related conflict.
   890  func isLabelsConflictErr(err error) bool {
   891  	return err == labelpb.ErrDuplicateLabels ||
   892  		err == labelpb.ErrEmptyLabels ||
   893  		err == labelpb.ErrOutOfOrderLabels
   894  }
   895  
   896  // isNotReady returns whether or not the given error represents a not ready error.
   897  func isNotReady(err error) bool {
   898  	return err == errNotReady ||
   899  		err == tsdb.ErrNotReady ||
   900  		status.Code(err) == codes.Unavailable
   901  }
   902  
   903  // isUnavailable returns whether or not the given error represents an unavailable error.
   904  func isUnavailable(err error) bool {
   905  	return err == errUnavailable ||
   906  		status.Code(err) == codes.Unavailable
   907  }
   908  
   909  // retryState encapsulates the number of request attempts made against a peer and
   910  // the earliest time at which the next attempt is allowed.
   911  type retryState struct {
   912  	attempt     float64
   913  	nextAllowed time.Time
   914  }
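
        // A hedged sketch, not part of the original file: how the exponential backoff driving
        // peerStates grows between attempts. Jitter is disabled here only to make the output
        // deterministic; the handler itself enables it.
        func exampleRetryBackoff() {
        	b := backoff.Backoff{Factor: 2, Min: 100 * time.Millisecond, Max: 5 * time.Second, Jitter: false}
        	for attempt := 0.0; attempt < 4; attempt++ {
        		// Roughly doubles from Min: 100ms, 200ms, 400ms, 800ms, capped at Max.
        		fmt.Println(b.ForAttempt(attempt))
        	}
        }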
   915  
   916  type expectedErrors []*expectedError
   917  
   918  type expectedError struct {
   919  	err   error
   920  	cause func(error) bool
   921  	count int
   922  }
   923  
   924  func (a expectedErrors) Len() int           { return len(a) }
   925  func (a expectedErrors) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
   926  func (a expectedErrors) Less(i, j int) bool { return a[i].count < a[j].count }
   927  
   928  // errorSet is a set of errors.
   929  type errorSet struct {
   930  	reasonSet map[string]struct{}
   931  	errs      []error
   932  }
   933  
   934  // Error returns a string containing a deduplicated set of reasons.
   935  func (es errorSet) Error() string {
   936  	if len(es.reasonSet) == 0 {
   937  		return ""
   938  	}
   939  	reasons := make([]string, 0, len(es.reasonSet))
   940  	for reason := range es.reasonSet {
   941  		reasons = append(reasons, reason)
   942  	}
   943  	sort.Strings(reasons)
   944  
   945  	var buf bytes.Buffer
   946  	if len(reasons) > 1 {
   947  		fmt.Fprintf(&buf, "%d errors: ", len(es.reasonSet))
   948  	}
   949  
   950  	var more bool
   951  	for _, reason := range reasons {
   952  		if more {
   953  			buf.WriteString("; ")
   954  		}
   955  		buf.WriteString(reason)
   956  		more = true
   957  	}
   958  
   959  	return buf.String()
   960  }
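
        // A hedged sketch, not part of the original file: Error deduplicates repeated reasons, so
        // two conflicts and one not-ready error render as a single combined message.
        func exampleErrorSet() {
        	var es errorSet
        	es.Add(errConflict)
        	es.Add(errConflict)
        	es.Add(errNotReady)
        	fmt.Println(es.Error()) // "2 errors: conflict; target not ready"
        }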
   961  
   962  // Add adds an error to the errorSet.
   963  func (es *errorSet) Add(err error) {
   964  	if err == nil {
   965  		return
   966  	}
   967  
   968  	if len(es.errs) == 0 {
   969  		es.errs = []error{err}
   970  	} else {
   971  		es.errs = append(es.errs, err)
   972  	}
   973  	if es.reasonSet == nil {
   974  		es.reasonSet = make(map[string]struct{})
   975  	}
   976  
   977  	switch addedErr := err.(type) {
   978  	case *replicationErrors:
   979  		for reason := range addedErr.reasonSet {
   980  			es.reasonSet[reason] = struct{}{}
   981  		}
   982  	case *writeErrors:
   983  		for reason := range addedErr.reasonSet {
   984  			es.reasonSet[reason] = struct{}{}
   985  		}
   986  	default:
   987  		es.reasonSet[err.Error()] = struct{}{}
   988  	}
   989  }
   990  
   991  // writeErrors contains all errors that have
   992  // occurred during a local write of a remote-write request.
   993  type writeErrors struct {
   994  	errorSet
   995  }
   996  
   997  // ErrOrNil returns the writeErrors instance if any
   998  // errors are contained in it.
   999  // Otherwise, it returns nil.
  1000  func (es *writeErrors) ErrOrNil() error {
  1001  	if len(es.errs) == 0 {
  1002  		return nil
  1003  	}
  1004  	return es
  1005  }
  1006  
  1007  // Cause returns the primary cause for a write failure.
  1008  // If multiple errors have occurred, Cause will prefer
  1009  // recoverable over non-recoverable errors.
  1010  func (es *writeErrors) Cause() error {
  1011  	if len(es.errs) == 0 {
  1012  		return nil
  1013  	}
  1014  
  1015  	expErrs := expectedErrors{
  1016  		{err: errUnavailable, cause: isUnavailable},
  1017  		{err: errNotReady, cause: isNotReady},
  1018  		{err: errConflict, cause: isConflict},
  1019  	}
  1020  
  1021  	var (
  1022  		unknownErr error
  1023  		knownCause bool
  1024  	)
  1025  	for _, werr := range es.errs {
  1026  		knownCause = false
  1027  		cause := errors.Cause(werr)
  1028  		for _, exp := range expErrs {
  1029  			if exp.cause(cause) {
  1030  				knownCause = true
  1031  				exp.count++
  1032  			}
  1033  		}
  1034  		if !knownCause {
  1035  			unknownErr = cause
  1036  		}
  1037  	}
  1038  
  1039  	for _, exp := range expErrs {
  1040  		if exp.count > 0 {
  1041  			return exp.err
  1042  		}
  1043  	}
  1044  
  1045  	return unknownErr
  1046  }
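
        // A hedged sketch, not part of the original file: Cause collapses mixed failures into a
        // single sentinel, preferring errUnavailable over errNotReady over errConflict.
        func exampleWriteErrorsCause() {
        	var werrs writeErrors
        	werrs.Add(errors.Wrap(errConflict, "out of order sample"))
        	werrs.Add(errors.Wrap(errUnavailable, "peer down"))
        	fmt.Println(werrs.Cause() == errUnavailable) // true: unavailable wins over conflict
        }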
  1047  
  1048  // replicationErrors contains errors that have happened while
  1049  // replicating a time series within a remote-write request.
  1050  type replicationErrors struct {
  1051  	errorSet
  1052  	threshold int
  1053  }
  1054  
  1055  // Cause extracts the sentinel error with the highest occurrence that
  1056  // has happened at least as many times as the given threshold.
  1057  // If no single error has met the threshold, but the
  1058  // total number of errors meets the threshold,
  1059  // Cause returns errInternal.
  1060  func (es *replicationErrors) Cause() error {
  1061  	if len(es.errs) == 0 {
  1062  		return errorSet{}
  1063  	}
  1064  
  1065  	expErrs := expectedErrors{
  1066  		{err: errConflict, cause: isConflict},
  1067  		{err: errNotReady, cause: isNotReady},
  1068  		{err: errUnavailable, cause: isUnavailable},
  1069  	}
  1070  	for _, exp := range expErrs {
  1071  		exp.count = 0
  1072  		for _, err := range es.errs {
  1073  			if exp.cause(errors.Cause(err)) {
  1074  				exp.count++
  1075  			}
  1076  		}
  1077  	}
  1078  
  1079  	// Determine which error occurred most.
  1080  	sort.Sort(sort.Reverse(expErrs))
  1081  	if exp := expErrs[0]; exp.count >= es.threshold {
  1082  		return exp.err
  1083  	}
  1084  
  1085  	if len(es.errs) >= es.threshold {
  1086  		return errInternal
  1087  	}
  1088  
  1089  	return nil
  1090  }
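
        // A hedged sketch, not part of the original file: with a write quorum of 2, a series whose
        // replica writes failed with two conflicts and one unavailable error resolves to errConflict,
        // the only cause that reaches the threshold.
        func exampleReplicationErrorsCause() {
        	rerrs := &replicationErrors{threshold: 2}
        	rerrs.Add(errConflict)
        	rerrs.Add(errConflict)
        	rerrs.Add(errUnavailable)
        	fmt.Println(rerrs.Cause() == errConflict) // true
        }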
  1091  
  1092  func newReplicationErrors(threshold, numErrors int) []*replicationErrors {
  1093  	errs := make([]*replicationErrors, numErrors)
  1094  	for i := range errs {
  1095  		errs[i] = &replicationErrors{threshold: threshold}
  1096  	}
  1097  	return errs
  1098  }
  1099  
  1100  func newPeerGroup(dialOpts ...grpc.DialOption) *peerGroup {
  1101  	return &peerGroup{
  1102  		dialOpts: dialOpts,
  1103  		cache:    map[string]storepb.WriteableStoreClient{},
  1104  		m:        sync.RWMutex{},
  1105  		dialer:   grpc.DialContext,
  1106  	}
  1107  }
  1108  
  1109  type peerGroup struct {
  1110  	dialOpts []grpc.DialOption
  1111  	cache    map[string]storepb.WriteableStoreClient
  1112  	m        sync.RWMutex
  1113  
  1114  	// dialer is used for testing.
  1115  	dialer func(ctx context.Context, target string, opts ...grpc.DialOption) (conn *grpc.ClientConn, err error)
  1116  }
  1117  
  1118  func (p *peerGroup) get(ctx context.Context, addr string) (storepb.WriteableStoreClient, error) {
  1119  	// Use an RLock first to avoid blocking when we don't need to.
  1120  	p.m.RLock()
  1121  	c, ok := p.cache[addr]
  1122  	p.m.RUnlock()
  1123  	if ok {
  1124  		return c, nil
  1125  	}
  1126  
  1127  	p.m.Lock()
  1128  	defer p.m.Unlock()
  1129  	// Make sure that another caller hasn't created the connection since obtaining the write lock.
  1130  	c, ok = p.cache[addr]
  1131  	if ok {
  1132  		return c, nil
  1133  	}
  1134  	conn, err := p.dialer(ctx, addr, p.dialOpts...)
  1135  	if err != nil {
  1136  		return nil, errors.Wrap(err, "failed to dial peer")
  1137  	}
  1138  
  1139  	client := storepb.NewWriteableStoreClient(conn)
  1140  	p.cache[addr] = client
  1141  	return client, nil
  1142  }
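
        // A hedged usage sketch, not part of the original file: peerGroup dials each remote write
        // endpoint at most once and caches the client, so concurrent fanout goroutines targeting the
        // same address share one gRPC connection. Dial options (TLS, etc.) are expected to come from
        // Options.DialOpts in real wiring; "example-tenant" is a placeholder.
        func examplePeerGroup(ctx context.Context, addr string, dialOpts ...grpc.DialOption) error {
        	peers := newPeerGroup(dialOpts...)
        	client, err := peers.get(ctx, addr) // dials on first use, served from cache afterwards
        	if err != nil {
        		return err
        	}
        	_, err = client.RemoteWrite(ctx, &storepb.WriteRequest{Tenant: "example-tenant"})
        	return err
        }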