github.com/thanos-io/thanos@v0.32.5/pkg/receive/handler.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package receive

import (
	"bytes"
	"context"
	"crypto/tls"
	"fmt"
	"io"
	stdlog "log"
	"math"
	"net"
	"net/http"
	"sort"
	"strconv"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/gogo/protobuf/proto"
	"github.com/jpillora/backoff"
	"github.com/klauspost/compress/s2"
	"github.com/mwitkow/go-conntrack"
	"github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/common/route"
	"github.com/prometheus/prometheus/model/relabel"
	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"

	"github.com/thanos-io/thanos/pkg/api"
	statusapi "github.com/thanos-io/thanos/pkg/api/status"
	"github.com/thanos-io/thanos/pkg/logging"

	extpromhttp "github.com/thanos-io/thanos/pkg/extprom/http"
	"github.com/thanos-io/thanos/pkg/runutil"
	"github.com/thanos-io/thanos/pkg/server/http/middleware"
	"github.com/thanos-io/thanos/pkg/store/labelpb"
	"github.com/thanos-io/thanos/pkg/store/storepb"
	"github.com/thanos-io/thanos/pkg/store/storepb/prompb"
	"github.com/thanos-io/thanos/pkg/tenancy"
	"github.com/thanos-io/thanos/pkg/tracing"
)

const (
	// DefaultStatsLimit is the default value used for limiting tenant stats.
	DefaultStatsLimit = 10
	// DefaultReplicaHeader is the default header used to designate the replica count of a write request.
	DefaultReplicaHeader = "THANOS-REPLICA"
	// AllTenantsQueryParam is the query parameter for getting TSDB stats for all tenants.
	AllTenantsQueryParam = "all_tenants"
	// LimitStatsQueryParam is the query parameter for limiting the amount of returned TSDB stats.
	LimitStatsQueryParam = "limit"
	// Labels for metrics.
	labelSuccess = "success"
	labelError   = "error"
)

var (
	// errConflict is returned whenever an operation fails due to any conflict-type error.
	errConflict = errors.New("conflict")

	errBadReplica  = errors.New("request replica exceeds receiver replication factor")
	errNotReady    = errors.New("target not ready")
	errUnavailable = errors.New("target not available")
	errInternal    = errors.New("internal error")
)

// Options for the web Handler.
type Options struct {
	Writer            *Writer
	ListenAddress     string
	Registry          *prometheus.Registry
	TenantHeader      string
	TenantField       string
	DefaultTenantID   string
	ReplicaHeader     string
	Endpoint          string
	ReplicationFactor uint64
	ReceiverMode      ReceiverMode
	Tracer            opentracing.Tracer
	TLSConfig         *tls.Config
	DialOpts          []grpc.DialOption
	ForwardTimeout    time.Duration
	MaxBackoff        time.Duration
	RelabelConfigs    []*relabel.Config
	TSDBStats         TSDBStats
	Limiter           *Limiter
}
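// A minimal sketch of how these options are typically wired into a Handler; the
// concrete addresses, header names and the writer/hashring/limiter variables below
// are illustrative assumptions, not values taken from this file:
//
//	h := NewHandler(logger, &Options{
//		Writer:            writer,
//		ListenAddress:     "0.0.0.0:19291",
//		Registry:          prometheus.NewRegistry(),
//		TenantHeader:      "THANOS-TENANT",
//		DefaultTenantID:   "default-tenant",
//		ReplicaHeader:     DefaultReplicaHeader,
//		Endpoint:          "127.0.0.1:10901",
//		ReplicationFactor: 3,
//		ForwardTimeout:    30 * time.Second,
//		MaxBackoff:        30 * time.Second,
//		Limiter:           limiter,
//	})
//	h.Hashring(hashring) // the Handler only becomes ready once a hashring is set
//	if err := h.Run(); err != nil {
//		// handle listener/server error
//	}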
// Handler serves a Prometheus remote write receiving HTTP endpoint.
type Handler struct {
	logger   log.Logger
	writer   *Writer
	router   *route.Router
	options  *Options
	listener net.Listener

	mtx          sync.RWMutex
	hashring     Hashring
	peers        *peerGroup
	expBackoff   backoff.Backoff
	peerStates   map[string]*retryState
	receiverMode ReceiverMode

	forwardRequests   *prometheus.CounterVec
	replications      *prometheus.CounterVec
	replicationFactor prometheus.Gauge

	writeSamplesTotal    *prometheus.HistogramVec
	writeTimeseriesTotal *prometheus.HistogramVec

	Limiter *Limiter
}

func NewHandler(logger log.Logger, o *Options) *Handler {
	if logger == nil {
		logger = log.NewNopLogger()
	}

	var registerer prometheus.Registerer = nil
	if o.Registry != nil {
		registerer = o.Registry
	}

	h := &Handler{
		logger:       logger,
		writer:       o.Writer,
		router:       route.New(),
		options:      o,
		peers:        newPeerGroup(o.DialOpts...),
		receiverMode: o.ReceiverMode,
		expBackoff: backoff.Backoff{
			Factor: 2,
			Min:    100 * time.Millisecond,
			Max:    o.MaxBackoff,
			Jitter: true,
		},
		Limiter: o.Limiter,
		forwardRequests: promauto.With(registerer).NewCounterVec(
			prometheus.CounterOpts{
				Name: "thanos_receive_forward_requests_total",
				Help: "The number of forward requests.",
			}, []string{"result"},
		),
		replications: promauto.With(registerer).NewCounterVec(
			prometheus.CounterOpts{
				Name: "thanos_receive_replications_total",
				Help: "The number of replication operations done by the receiver. The success of replication is fulfilled when a quorum is met.",
			}, []string{"result"},
		),
		replicationFactor: promauto.With(registerer).NewGauge(
			prometheus.GaugeOpts{
				Name: "thanos_receive_replication_factor",
				Help: "The number of times to replicate incoming write requests.",
			},
		),
		writeTimeseriesTotal: promauto.With(registerer).NewHistogramVec(
			prometheus.HistogramOpts{
				Namespace: "thanos",
				Subsystem: "receive",
				Name:      "write_timeseries",
				Help:      "The number of timeseries received in the incoming write requests.",
				Buckets:   []float64{10, 50, 100, 500, 1000, 5000, 10000},
			}, []string{"code", "tenant"},
		),
		writeSamplesTotal: promauto.With(registerer).NewHistogramVec(
			prometheus.HistogramOpts{
				Namespace: "thanos",
				Subsystem: "receive",
				Name:      "write_samples",
				Help:      "The number of samples received in the incoming write requests.",
				Buckets:   []float64{10, 50, 100, 500, 1000, 5000, 10000},
			}, []string{"code", "tenant"},
		),
	}

	h.forwardRequests.WithLabelValues(labelSuccess)
	h.forwardRequests.WithLabelValues(labelError)
	h.replications.WithLabelValues(labelSuccess)
	h.replications.WithLabelValues(labelError)

	if o.ReplicationFactor > 1 {
		h.replicationFactor.Set(float64(o.ReplicationFactor))
	} else {
		h.replicationFactor.Set(1)
	}

	ins := extpromhttp.NewNopInstrumentationMiddleware()
	if o.Registry != nil {
		ins = extpromhttp.NewTenantInstrumentationMiddleware(
			o.TenantHeader,
			o.Registry,
			[]float64{0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.25, 0.5, 0.75, 1, 2, 3, 4, 5},
		)
	}

	readyf := h.testReady
	instrf := func(name string, next func(w http.ResponseWriter, r *http.Request)) http.HandlerFunc {
		next = ins.NewHandler(name, http.HandlerFunc(next))

		if o.Tracer != nil {
			next = tracing.HTTPMiddleware(o.Tracer, name, logger, http.HandlerFunc(next))
		}
		return next
	}

	h.router.Post(
		"/api/v1/receive",
		instrf(
			"receive",
			readyf(
				middleware.RequestID(
					http.HandlerFunc(h.receiveHTTP),
				),
			),
		),
	)

	statusAPI := statusapi.New(statusapi.Options{
		GetStats: h.getStats,
		Registry: h.options.Registry,
	})
	statusAPI.Register(h.router, o.Tracer, logger, ins, logging.NewHTTPServerMiddleware(logger))

	return h
}

// Hashring sets the hashring for the handler and marks the hashring as ready.
// The hashring must be set to a non-nil value in order for the
// handler to be ready and usable.
// If the hashring is nil, then the handler is marked as not ready.
func (h *Handler) Hashring(hashring Hashring) {
	h.mtx.Lock()
	defer h.mtx.Unlock()

	h.hashring = hashring
	h.expBackoff.Reset()
	h.peerStates = make(map[string]*retryState)
}

// Verifies whether the server is ready or not.
func (h *Handler) isReady() bool {
	h.mtx.RLock()
	hr := h.hashring != nil
	sr := h.writer != nil
	h.mtx.RUnlock()
	return sr && hr
}

// Checks if server is ready, calls f if it is, returns 503 if it is not.
func (h *Handler) testReady(f http.HandlerFunc) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		if h.isReady() {
			f(w, r)
			return
		}

		w.WriteHeader(http.StatusServiceUnavailable)
		_, err := fmt.Fprintf(w, "Service Unavailable")
		if err != nil {
			h.logger.Log("msg", "failed to write to response body", "err", err)
		}
	}
}

func getStatsLimitParameter(r *http.Request) (int, error) {
	statsLimitStr := r.URL.Query().Get(LimitStatsQueryParam)
	if statsLimitStr == "" {
		return DefaultStatsLimit, nil
	}
	statsLimit, err := strconv.ParseInt(statsLimitStr, 10, 0)
	if err != nil {
		return 0, fmt.Errorf("unable to parse '%s' parameter: %w", LimitStatsQueryParam, err)
	}
	if statsLimit > math.MaxInt {
		return 0, fmt.Errorf("'%s' parameter is larger than %d", LimitStatsQueryParam, math.MaxInt)
	}
	return int(statsLimit), nil
}

func (h *Handler) getStats(r *http.Request, statsByLabelName string) ([]statusapi.TenantStats, *api.ApiError) {
	if !h.isReady() {
		return nil, &api.ApiError{Typ: api.ErrorInternal, Err: fmt.Errorf("service unavailable")}
	}

	tenantID := r.Header.Get(h.options.TenantHeader)
	getAllTenantStats := r.FormValue(AllTenantsQueryParam) == "true"
	if getAllTenantStats && tenantID != "" {
		err := fmt.Errorf("using both the %s parameter and the %s header is not supported", AllTenantsQueryParam, h.options.TenantHeader)
		return nil, &api.ApiError{Typ: api.ErrorBadData, Err: err}
	}

	statsLimit, err := getStatsLimitParameter(r)
	if err != nil {
		return nil, &api.ApiError{Typ: api.ErrorBadData, Err: err}
	}

	if getAllTenantStats {
		return h.options.TSDBStats.TenantStats(statsLimit, statsByLabelName), nil
	}

	if tenantID == "" {
		tenantID = h.options.DefaultTenantID
	}

	return h.options.TSDBStats.TenantStats(statsLimit, statsByLabelName, tenantID), nil
}
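// For illustration, how the limit parameter is resolved by getStatsLimitParameter
// (a sketch using net/http/httptest; not taken from this package's tests):
//
//	r := httptest.NewRequest(http.MethodGet, "/?limit=25", nil)
//	limit, _ := getStatsLimitParameter(r) // limit == 25
//
//	r = httptest.NewRequest(http.MethodGet, "/", nil)
//	limit, _ = getStatsLimitParameter(r) // limit == DefaultStatsLimit (10)
//
// Note that getStats rejects requests combining all_tenants=true with an explicit
// tenant header, returning an api.ErrorBadData error.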
// Close stops the Handler.
func (h *Handler) Close() {
	if h.listener != nil {
		runutil.CloseWithLogOnErr(h.logger, h.listener, "receive HTTP listener")
	}
}

// Run serves the HTTP endpoints.
func (h *Handler) Run() error {
	level.Info(h.logger).Log("msg", "Start listening for connections", "address", h.options.ListenAddress)

	var err error
	h.listener, err = net.Listen("tcp", h.options.ListenAddress)
	if err != nil {
		return err
	}

	// Monitor incoming connections with conntrack.
	h.listener = conntrack.NewListener(h.listener,
		conntrack.TrackWithName("http"),
		conntrack.TrackWithTracing())

	errlog := stdlog.New(log.NewStdlibAdapter(level.Error(h.logger)), "", 0)

	httpSrv := &http.Server{
		Handler:   h.router,
		ErrorLog:  errlog,
		TLSConfig: h.options.TLSConfig,
	}

	if h.options.TLSConfig != nil {
		level.Info(h.logger).Log("msg", "Serving HTTPS", "address", h.options.ListenAddress)
		// Cert & Key are already being passed in via TLSConfig.
		return httpSrv.ServeTLS(h.listener, "", "")
	}

	level.Info(h.logger).Log("msg", "Serving plain HTTP", "address", h.options.ListenAddress)
	return httpSrv.Serve(h.listener)
}

// replica encapsulates the replica number of a request and whether the request is
// already replicated.
type replica struct {
	n          uint64
	replicated bool
}

// endpointReplica is a pair of a receive endpoint and a write request replica.
type endpointReplica struct {
	endpoint string
	replica  uint64
}

type trackedSeries struct {
	seriesIDs  []int
	timeSeries []prompb.TimeSeries
}

type writeResponse struct {
	seriesIDs []int
	err       error
}

func newWriteResponse(seriesIDs []int, err error) writeResponse {
	return writeResponse{
		seriesIDs: seriesIDs,
		err:       err,
	}
}

func (h *Handler) handleRequest(ctx context.Context, rep uint64, tenant string, wreq *prompb.WriteRequest) error {
	tLogger := log.With(h.logger, "tenant", tenant)

	// This replica value is used to detect cycles in cyclic topologies.
	// A non-zero value indicates that the request has already been replicated by a previous receive instance.
	// For almost all users, this is only used in fully connected topologies of IngestorRouter instances.
	// For acyclic topologies that use RouterOnly and IngestorOnly instances, this causes issues when replicating data.
	// See discussion in: https://github.com/thanos-io/thanos/issues/4359.
	if h.receiverMode == RouterOnly || h.receiverMode == IngestorOnly {
		rep = 0
	}

	// The replica value in the header is one-indexed, thus we need >.
	if rep > h.options.ReplicationFactor {
		level.Error(tLogger).Log("err", errBadReplica, "msg", "write request rejected",
			"request_replica", rep, "replication_factor", h.options.ReplicationFactor)
		return errBadReplica
	}

	r := replica{n: rep, replicated: rep != 0}

	// On the wire, the format is 1-indexed; in code it is 0-indexed, so we decrement the value if it was already replicated.
	if r.replicated {
		r.n--
	}

	// Forward any time series as necessary. All time series
	// destined for the local node will be written to the receiver.
	// Time series will be replicated as necessary.
	return h.forward(ctx, tenant, r, wreq)
}
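// Worked example of the replica bookkeeping above, assuming a ReplicationFactor
// of 3 (header values are illustrative):
//
//	replica header  rep  replicated  r.n (0-indexed)
//	(absent)        0    false       0
//	1               1    true        0
//	3               3    true        2
//	4               4    rejected    errBadReplica, since 4 > 3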
func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
	var err error
	span, ctx := tracing.StartSpan(r.Context(), "receive_http")
	defer span.Finish()

	tenant, err := tenancy.GetTenantFromHTTP(r, h.options.TenantHeader, h.options.DefaultTenantID, h.options.TenantField)
	if err != nil {
		level.Error(h.logger).Log("msg", "error getting tenant from HTTP", "err", err)
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}

	tLogger := log.With(h.logger, "tenant", tenant)

	writeGate := h.Limiter.WriteGate()
	tracing.DoInSpan(r.Context(), "receive_write_gate_ismyturn", func(ctx context.Context) {
		err = writeGate.Start(r.Context())
	})
	defer writeGate.Done()
	if err != nil {
		level.Error(tLogger).Log("err", err, "msg", "internal server error")
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	under, err := h.Limiter.HeadSeriesLimiter.isUnderLimit(tenant)
	if err != nil {
		level.Error(tLogger).Log("msg", "error while limiting", "err", err.Error())
	}

	// Fail the request fully if the tenant has exceeded the configured limit.
	if !under {
		http.Error(w, "tenant is above active series limit", http.StatusTooManyRequests)
		return
	}

	requestLimiter := h.Limiter.RequestLimiter()
	// io.ReadAll dynamically adjusts the byte slice for read data, starting from 512B.
	// Since this is the receive hot path, grow the buffer upfront to save allocations and CPU time.
	compressed := bytes.Buffer{}
	if r.ContentLength >= 0 {
		if !requestLimiter.AllowSizeBytes(tenant, r.ContentLength) {
			http.Error(w, "write request too large", http.StatusRequestEntityTooLarge)
			return
		}
		compressed.Grow(int(r.ContentLength))
	} else {
		compressed.Grow(512)
	}
	_, err = io.Copy(&compressed, r.Body)
	if err != nil {
		http.Error(w, errors.Wrap(err, "read compressed request body").Error(), http.StatusInternalServerError)
		return
	}
	reqBuf, err := s2.Decode(nil, compressed.Bytes())
	if err != nil {
		level.Error(tLogger).Log("msg", "snappy decode error", "err", err)
		http.Error(w, errors.Wrap(err, "snappy decode error").Error(), http.StatusBadRequest)
		return
	}

	if !requestLimiter.AllowSizeBytes(tenant, int64(len(reqBuf))) {
		http.Error(w, "write request too large", http.StatusRequestEntityTooLarge)
		return
	}

	// NOTE: Due to zero-copy ZLabels, Labels used from WriteRequests keep memory
	// from the whole request. Ensure that we always copy those when we want to
	// store them for a longer time.
	var wreq prompb.WriteRequest
	if err := proto.Unmarshal(reqBuf, &wreq); err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}

	rep := uint64(0)
	// If the header is empty, we assume the request is not yet replicated.
	if replicaRaw := r.Header.Get(h.options.ReplicaHeader); replicaRaw != "" {
		if rep, err = strconv.ParseUint(replicaRaw, 10, 64); err != nil {
			http.Error(w, "could not parse replica header", http.StatusBadRequest)
			return
		}
	}

	// Exit early if the request contained no data. We don't support metadata yet. We also cannot fail here, because
	// that would break forward compatibility for the remote write proto.
	if len(wreq.Timeseries) == 0 {
		// TODO(yeya24): Handle remote write metadata.
		if len(wreq.Metadata) > 0 {
			// TODO(bwplotka): Do we need this error message?
			level.Debug(tLogger).Log("msg", "only metadata from client; metadata ingestion not supported; skipping")
			return
		}
		level.Debug(tLogger).Log("msg", "empty remote write request; client bug or newer remote write protocol used?; skipping")
		return
	}

	if !requestLimiter.AllowSeries(tenant, int64(len(wreq.Timeseries))) {
		http.Error(w, "too many timeseries", http.StatusRequestEntityTooLarge)
		return
	}

	totalSamples := 0
	for _, timeseries := range wreq.Timeseries {
		totalSamples += len(timeseries.Samples)
	}
	if !requestLimiter.AllowSamples(tenant, int64(totalSamples)) {
		http.Error(w, "too many samples", http.StatusRequestEntityTooLarge)
		return
	}

	// Apply relabeling configs.
	h.relabel(&wreq)
	if len(wreq.Timeseries) == 0 {
		level.Debug(tLogger).Log("msg", "remote write request dropped due to relabeling.")
		return
	}

	responseStatusCode := http.StatusOK
	if err = h.handleRequest(ctx, rep, tenant, &wreq); err != nil {
		level.Debug(tLogger).Log("msg", "failed to handle request", "err", err)
		switch errors.Cause(err) {
		case errNotReady:
			responseStatusCode = http.StatusServiceUnavailable
		case errUnavailable:
			responseStatusCode = http.StatusServiceUnavailable
		case errConflict:
			responseStatusCode = http.StatusConflict
		case errBadReplica:
			responseStatusCode = http.StatusBadRequest
		default:
			level.Error(tLogger).Log("err", err, "msg", "internal server error")
			responseStatusCode = http.StatusInternalServerError
		}
		http.Error(w, err.Error(), responseStatusCode)
	}
	h.writeTimeseriesTotal.WithLabelValues(strconv.Itoa(responseStatusCode), tenant).Observe(float64(len(wreq.Timeseries)))
	h.writeSamplesTotal.WithLabelValues(strconv.Itoa(responseStatusCode), tenant).Observe(float64(totalSamples))
}
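// A sketch of the request shape this endpoint expects from a remote-write client.
// The snappy package, the receiver address and the header values are assumptions
// for illustration; the tenant and replica header names are configurable via
// Options, and s2.Decode above also accepts Snappy-compressed blocks:
//
//	pb, _ := proto.Marshal(&prompb.WriteRequest{Timeseries: series})
//	body := snappy.Encode(nil, pb) // github.com/golang/snappy
//	req, _ := http.NewRequest(http.MethodPost, "http://receiver:19291/api/v1/receive", bytes.NewReader(body))
//	req.Header.Set("THANOS-TENANT", "team-a")
//	// THANOS-REPLICA is normally left unset by clients; receivers set it when replicating to peers.
//	resp, _ := http.DefaultClient.Do(req)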
// forward accepts a write request, batches its time series by
// corresponding endpoint, and forwards them in parallel to the
// correct endpoint. Requests destined for the local node are written
// to the local receiver. For a given write request, at most one outgoing
// write request will be made to every other node in the hashring,
// unless the request needs to be replicated.
// The function only returns when all requests have finished
// or the context is canceled.
func (h *Handler) forward(ctx context.Context, tenant string, r replica, wreq *prompb.WriteRequest) error {
	span, ctx := tracing.StartSpan(ctx, "receive_fanout_forward")
	defer span.Finish()

	// It is possible that the hashring was ready in testReady() but is unready now,
	// so we need to lock here.
	h.mtx.RLock()
	if h.hashring == nil {
		h.mtx.RUnlock()
		return errors.New("hashring is not ready")
	}

	var replicas []uint64
	if r.replicated {
		replicas = []uint64{r.n}
	} else {
		for rn := uint64(0); rn < h.options.ReplicationFactor; rn++ {
			replicas = append(replicas, rn)
		}
	}

	wreqs := make(map[endpointReplica]trackedSeries)
	for tsID, ts := range wreq.Timeseries {
		for _, rn := range replicas {
			endpoint, err := h.hashring.GetN(tenant, &ts, rn)
			if err != nil {
				h.mtx.RUnlock()
				return err
			}
			key := endpointReplica{endpoint: endpoint, replica: rn}
			writeTarget, ok := wreqs[key]
			if !ok {
				writeTarget = trackedSeries{
					seriesIDs:  make([]int, 0),
					timeSeries: make([]prompb.TimeSeries, 0),
				}
			}
			writeTarget.timeSeries = append(wreqs[key].timeSeries, ts)
			writeTarget.seriesIDs = append(wreqs[key].seriesIDs, tsID)
			wreqs[key] = writeTarget
		}
	}
	h.mtx.RUnlock()

	return h.fanoutForward(ctx, tenant, wreqs, len(wreq.Timeseries), r.replicated)
}
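// Worked example of the batching above, assuming a ReplicationFactor of 3 and a
// hashring that spreads series over endpoints A, B and C (purely illustrative):
// an un-replicated request with two series produces replicas {0, 1, 2}, and every
// (series, replica) pair is hashed to an endpoint, so wreqs might end up as
//
//	{A,0}: series 0,1    {B,1}: series 0    {C,1}: series 1    {B,2}: series 0,1
//
// i.e. at most one outgoing request per (endpoint, replica) pair, with batches for
// the local endpoint written directly to the local TSDB in fanoutForward.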
// writeQuorum returns the minimum number of replicas that have to confirm a write
// success before claiming replication success.
func (h *Handler) writeQuorum() int {
	return int((h.options.ReplicationFactor / 2) + 1)
}

func quorumReached(successes []int, successThreshold int) bool {
	for _, success := range successes {
		if success < successThreshold {
			return false
		}
	}

	return true
}
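// For illustration, with a replication factor of 3 the write quorum is
// (3 / 2) + 1 == 2, so every series needs at least two successful replica
// writes before the request is acknowledged:
//
//	quorumReached([]int{2, 3, 2}, 2) // true
//	quorumReached([]int{2, 1, 3}, 2) // false: the second series has only one success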
// fanoutForward fans out the given set of write requests concurrently. It returns
// as soon as a quorum of requests has succeeded or failed, or when the context is canceled.
func (h *Handler) fanoutForward(pctx context.Context, tenant string, wreqs map[endpointReplica]trackedSeries, numSeries int, seriesReplicated bool) error {
	var errs writeErrors

	fctx, cancel := context.WithTimeout(tracing.CopyTraceContext(context.Background(), pctx), h.options.ForwardTimeout)
	defer func() {
		if errs.ErrOrNil() != nil {
			// NOTICE: The cancel function is intentionally not used on all paths:
			// if there is no error when quorum is reached,
			// let the forward requests optimistically run until the timeout.
			cancel()
		}
	}()

	var tLogger log.Logger
	{
		logTags := []interface{}{"tenant", tenant}
		if id, ok := middleware.RequestIDFromContext(pctx); ok {
			logTags = append(logTags, "request-id", id)
		}
		tLogger = log.With(h.logger, logTags...)
	}

	responses := make(chan writeResponse)

	var wg sync.WaitGroup
	for writeTarget := range wreqs {
		wg.Add(1)

		// If the endpoint for the write request is the
		// local node, then don't make a request but store locally.
		// By handling replication to the local node in the same
		// function as replication to other nodes, we can treat
		// a failure to write locally as just another error that
		// can be ignored if the replication factor is met.
		if writeTarget.endpoint == h.options.Endpoint {
			go func(writeTarget endpointReplica) {
				defer wg.Done()

				var err error
				tracing.DoInSpan(fctx, "receive_tsdb_write", func(_ context.Context) {
					err = h.writer.Write(fctx, tenant, &prompb.WriteRequest{
						Timeseries: wreqs[writeTarget].timeSeries,
					})
				})
				if err != nil {
					level.Debug(tLogger).Log("msg", "local tsdb write failed", "err", err.Error())
					responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, errors.Wrapf(err, "store locally for endpoint %v", writeTarget.endpoint))
					return
				}
				responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, nil)
			}(writeTarget)

			continue
		}

		// Make a request to the specified endpoint.
		go func(writeTarget endpointReplica) {
			defer wg.Done()

			var (
				err error
				cl  storepb.WriteableStoreClient
			)
			defer func() {
				// This is an actual remote forward request, so report the metric here.
				if err != nil {
					h.forwardRequests.WithLabelValues(labelError).Inc()
					if !seriesReplicated {
						h.replications.WithLabelValues(labelError).Inc()
					}
					return
				}
				h.forwardRequests.WithLabelValues(labelSuccess).Inc()
				if !seriesReplicated {
					h.replications.WithLabelValues(labelSuccess).Inc()
				}
			}()

			cl, err = h.peers.get(fctx, writeTarget.endpoint)
			if err != nil {
				responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, errors.Wrapf(err, "get peer connection for endpoint %v", writeTarget.endpoint))
				return
			}

			h.mtx.RLock()
			b, ok := h.peerStates[writeTarget.endpoint]
			if ok {
				if time.Now().Before(b.nextAllowed) {
					h.mtx.RUnlock()
					responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, errors.Wrapf(errUnavailable, "backing off forward request for endpoint %v", writeTarget.endpoint))
					return
				}
			}
			h.mtx.RUnlock()

			// Create a span to track the request made to another receive node.
			tracing.DoInSpan(fctx, "receive_forward", func(ctx context.Context) {
				// Actually make the request against the endpoint we determined should handle these time series.
				_, err = cl.RemoteWrite(ctx, &storepb.WriteRequest{
					Timeseries: wreqs[writeTarget].timeSeries,
					Tenant:     tenant,
					// Increment replica since the on-the-wire format is 1-indexed and 0 indicates un-replicated.
					Replica: int64(writeTarget.replica + 1),
				})
			})
			if err != nil {
				// If the peer connection is unavailable, don't attempt to send requests constantly.
				if st, ok := status.FromError(err); ok {
					if st.Code() == codes.Unavailable {
						h.mtx.Lock()
						if b, ok := h.peerStates[writeTarget.endpoint]; ok {
							b.attempt++
							dur := h.expBackoff.ForAttempt(b.attempt)
							b.nextAllowed = time.Now().Add(dur)
							level.Debug(tLogger).Log("msg", "target unavailable, backing off", "for", dur)
						} else {
							h.peerStates[writeTarget.endpoint] = &retryState{nextAllowed: time.Now().Add(h.expBackoff.ForAttempt(0))}
						}
						h.mtx.Unlock()
					}
				}
				werr := errors.Wrapf(err, "forwarding request to endpoint %v", writeTarget.endpoint)
				responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, werr)
				return
			}
			h.mtx.Lock()
			delete(h.peerStates, writeTarget.endpoint)
			h.mtx.Unlock()

			responses <- newWriteResponse(wreqs[writeTarget].seriesIDs, nil)
		}(writeTarget)
	}

	go func() {
		wg.Wait()
		close(responses)
	}()

	// At the end, make sure to exhaust the channel, letting remaining unnecessary requests finish asynchronously.
	// This is needed if the context is canceled or if we reached the success or failure quorum faster.
	defer func() {
		go func() {
			for wresp := range responses {
				if wresp.err != nil {
					level.Debug(tLogger).Log("msg", "request failed, but not needed to achieve quorum", "err", wresp.err)
				}
			}
		}()
	}()

	quorum := h.writeQuorum()
	if seriesReplicated {
		quorum = 1
	}
	successes := make([]int, numSeries)
	seriesErrs := newReplicationErrors(quorum, numSeries)
	for {
		select {
		case <-fctx.Done():
			return fctx.Err()
		case wresp, more := <-responses:
			if !more {
				for _, rerr := range seriesErrs {
					errs.Add(rerr)
				}
				return errs.ErrOrNil()
			}

			if wresp.err != nil {
				for _, tsID := range wresp.seriesIDs {
					seriesErrs[tsID].Add(wresp.err)
				}
				continue
			}
			for _, tsID := range wresp.seriesIDs {
				successes[tsID]++
			}
			if quorumReached(successes, quorum) {
				return nil
			}
		}
	}
}

// RemoteWrite implements the gRPC remote write handler for storepb.WriteableStore.
func (h *Handler) RemoteWrite(ctx context.Context, r *storepb.WriteRequest) (*storepb.WriteResponse, error) {
	span, ctx := tracing.StartSpan(ctx, "receive_grpc")
	defer span.Finish()

	err := h.handleRequest(ctx, uint64(r.Replica), r.Tenant, &prompb.WriteRequest{Timeseries: r.Timeseries})
	if err != nil {
		level.Debug(h.logger).Log("msg", "failed to handle request", "err", err)
	}
	switch errors.Cause(err) {
	case nil:
		return &storepb.WriteResponse{}, nil
	case errNotReady:
		return nil, status.Error(codes.Unavailable, err.Error())
	case errUnavailable:
		return nil, status.Error(codes.Unavailable, err.Error())
	case errConflict:
		return nil, status.Error(codes.AlreadyExists, err.Error())
	case errBadReplica:
		return nil, status.Error(codes.InvalidArgument, err.Error())
	default:
		return nil, status.Error(codes.Internal, err.Error())
	}
}

// relabel relabels the time series labels in the remote write request.
func (h *Handler) relabel(wreq *prompb.WriteRequest) {
	if len(h.options.RelabelConfigs) == 0 {
		return
	}
	timeSeries := make([]prompb.TimeSeries, 0, len(wreq.Timeseries))
	for _, ts := range wreq.Timeseries {
		var keep bool
		lbls, keep := relabel.Process(labelpb.ZLabelsToPromLabels(ts.Labels), h.options.RelabelConfigs...)
		if !keep {
			continue
		}
		ts.Labels = labelpb.ZLabelsFromPromLabels(lbls)
		timeSeries = append(timeSeries, ts)
	}
	wreq.Timeseries = timeSeries
}
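// A sketch of a relabel configuration that would drop one metric before it is
// forwarded or stored (the metric name and the model package reference are
// illustrative; configs are supplied via Options.RelabelConfigs):
//
//	cfg := &relabel.Config{
//		SourceLabels: model.LabelNames{"__name__"}, // github.com/prometheus/common/model
//		Regex:        relabel.MustNewRegexp("test_metric_to_drop"),
//		Action:       relabel.Drop,
//	}
//	// A Handler built with Options{RelabelConfigs: []*relabel.Config{cfg}} removes
//	// every time series whose metric name matches the regex.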
// isConflict returns whether or not the given error represents a conflict.
func isConflict(err error) bool {
	if err == nil {
		return false
	}
	return err == errConflict ||
		isSampleConflictErr(err) ||
		isExemplarConflictErr(err) ||
		isLabelsConflictErr(err) ||
		status.Code(err) == codes.AlreadyExists
}

// isSampleConflictErr returns whether or not the given error represents
// a sample-related conflict.
func isSampleConflictErr(err error) bool {
	return err == storage.ErrDuplicateSampleForTimestamp ||
		err == storage.ErrOutOfOrderSample ||
		err == storage.ErrOutOfBounds ||
		err == storage.ErrTooOldSample
}

// isExemplarConflictErr returns whether or not the given error represents
// an exemplar-related conflict.
func isExemplarConflictErr(err error) bool {
	return err == storage.ErrDuplicateExemplar ||
		err == storage.ErrOutOfOrderExemplar ||
		err == storage.ErrExemplarLabelLength
}

// isLabelsConflictErr returns whether or not the given error represents
// a labels-related conflict.
func isLabelsConflictErr(err error) bool {
	return err == labelpb.ErrDuplicateLabels ||
		err == labelpb.ErrEmptyLabels ||
		err == labelpb.ErrOutOfOrderLabels
}

// isNotReady returns whether or not the given error represents a not ready error.
func isNotReady(err error) bool {
	return err == errNotReady ||
		err == tsdb.ErrNotReady ||
		status.Code(err) == codes.Unavailable
}

// isUnavailable returns whether or not the given error represents an unavailable error.
func isUnavailable(err error) bool {
	return err == errUnavailable ||
		status.Code(err) == codes.Unavailable
}

// retryState encapsulates the number of request attempts made against a peer and
// the time at which the next attempt is allowed.
type retryState struct {
	attempt     float64
	nextAllowed time.Time
}
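// Sketch of how the exponential backoff configured in NewHandler plays out for an
// unavailable peer (jitter omitted and the Max value is illustrative; in the
// Handler it comes from Options.MaxBackoff):
//
//	b := backoff.Backoff{Factor: 2, Min: 100 * time.Millisecond, Max: 30 * time.Second, Jitter: false}
//	b.ForAttempt(0) // 100ms
//	b.ForAttempt(1) // 200ms
//	b.ForAttempt(2) // 400ms
//
// fanoutForward records the attempt count and next allowed time in peerStates, and
// forward requests to that endpoint fail fast with errUnavailable until nextAllowed
// has passed.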
type expectedErrors []*expectedError

type expectedError struct {
	err   error
	cause func(error) bool
	count int
}

func (a expectedErrors) Len() int           { return len(a) }
func (a expectedErrors) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a expectedErrors) Less(i, j int) bool { return a[i].count < a[j].count }

// errorSet is a set of errors.
type errorSet struct {
	reasonSet map[string]struct{}
	errs      []error
}

// Error returns a string containing a deduplicated set of reasons.
func (es errorSet) Error() string {
	if len(es.reasonSet) == 0 {
		return ""
	}
	reasons := make([]string, 0, len(es.reasonSet))
	for reason := range es.reasonSet {
		reasons = append(reasons, reason)
	}
	sort.Strings(reasons)

	var buf bytes.Buffer
	if len(reasons) > 1 {
		fmt.Fprintf(&buf, "%d errors: ", len(es.reasonSet))
	}

	var more bool
	for _, reason := range reasons {
		if more {
			buf.WriteString("; ")
		}
		buf.WriteString(reason)
		more = true
	}

	return buf.String()
}

// Add adds an error to the errorSet.
func (es *errorSet) Add(err error) {
	if err == nil {
		return
	}

	if len(es.errs) == 0 {
		es.errs = []error{err}
	} else {
		es.errs = append(es.errs, err)
	}
	if es.reasonSet == nil {
		es.reasonSet = make(map[string]struct{})
	}

	switch addedErr := err.(type) {
	case *replicationErrors:
		for reason := range addedErr.reasonSet {
			es.reasonSet[reason] = struct{}{}
		}
	case *writeErrors:
		for reason := range addedErr.reasonSet {
			es.reasonSet[reason] = struct{}{}
		}
	default:
		es.reasonSet[err.Error()] = struct{}{}
	}
}

// writeErrors contains all errors that have
// occurred during a local write of a remote-write request.
type writeErrors struct {
	errorSet
}

// ErrOrNil returns the writeErrors instance if any
// errors are contained in it.
// Otherwise, it returns nil.
func (es *writeErrors) ErrOrNil() error {
	if len(es.errs) == 0 {
		return nil
	}
	return es
}

// Cause returns the primary cause for a write failure.
// If multiple errors have occurred, Cause will prefer
// recoverable over non-recoverable errors.
func (es *writeErrors) Cause() error {
	if len(es.errs) == 0 {
		return nil
	}

	expErrs := expectedErrors{
		{err: errUnavailable, cause: isUnavailable},
		{err: errNotReady, cause: isNotReady},
		{err: errConflict, cause: isConflict},
	}

	var (
		unknownErr error
		knownCause bool
	)
	for _, werr := range es.errs {
		knownCause = false
		cause := errors.Cause(werr)
		for _, exp := range expErrs {
			if exp.cause(cause) {
				knownCause = true
				exp.count++
			}
		}
		if !knownCause {
			unknownErr = cause
		}
	}

	for _, exp := range expErrs {
		if exp.count > 0 {
			return exp.err
		}
	}

	return unknownErr
}

// replicationErrors contains errors that have happened while
// replicating a time series within a remote-write request.
type replicationErrors struct {
	errorSet
	threshold int
}

// Cause extracts the sentinel error with the highest occurrence count that
// has happened at least as many times as the given threshold.
// If no single error has occurred that often, but the
// total number of errors meets the threshold,
// Cause returns errInternal.
func (es *replicationErrors) Cause() error {
	if len(es.errs) == 0 {
		return errorSet{}
	}

	expErrs := expectedErrors{
		{err: errConflict, cause: isConflict},
		{err: errNotReady, cause: isNotReady},
		{err: errUnavailable, cause: isUnavailable},
	}
	for _, exp := range expErrs {
		exp.count = 0
		for _, err := range es.errs {
			if exp.cause(errors.Cause(err)) {
				exp.count++
			}
		}
	}

	// Determine which error occurred most.
	sort.Sort(sort.Reverse(expErrs))
	if exp := expErrs[0]; exp.count >= es.threshold {
		return exp.err
	}

	if len(es.errs) >= es.threshold {
		return errInternal
	}

	return nil
}
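// Worked example, assuming a replication threshold (the write quorum) of 2:
//
//	errs = {errConflict, errConflict, errUnavailable} -> Cause() == errConflict (occurred 2 times, 2 >= threshold)
//	errs = {errConflict, errNotReady, errUnavailable}  -> Cause() == errInternal (no single cause reaches 2, but 3 errors >= threshold)
//	errs = {errUnavailable}                            -> Cause() == nil (below the threshold)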
func newReplicationErrors(threshold, numErrors int) []*replicationErrors {
	errs := make([]*replicationErrors, numErrors)
	for i := range errs {
		errs[i] = &replicationErrors{threshold: threshold}
	}
	return errs
}

func newPeerGroup(dialOpts ...grpc.DialOption) *peerGroup {
	return &peerGroup{
		dialOpts: dialOpts,
		cache:    map[string]storepb.WriteableStoreClient{},
		m:        sync.RWMutex{},
		dialer:   grpc.DialContext,
	}
}

type peerGroup struct {
	dialOpts []grpc.DialOption
	cache    map[string]storepb.WriteableStoreClient
	m        sync.RWMutex

	// dialer is used for testing.
	dialer func(ctx context.Context, target string, opts ...grpc.DialOption) (conn *grpc.ClientConn, err error)
}

func (p *peerGroup) get(ctx context.Context, addr string) (storepb.WriteableStoreClient, error) {
	// Use an RLock first to prevent blocking if we don't need to.
	p.m.RLock()
	c, ok := p.cache[addr]
	p.m.RUnlock()
	if ok {
		return c, nil
	}

	p.m.Lock()
	defer p.m.Unlock()
	// Make sure that another caller hasn't created the connection since obtaining the write lock.
	c, ok = p.cache[addr]
	if ok {
		return c, nil
	}
	conn, err := p.dialer(ctx, addr, p.dialOpts...)
	if err != nil {
		return nil, errors.Wrap(err, "failed to dial peer")
	}

	client := storepb.NewWriteableStoreClient(conn)
	p.cache[addr] = client
	return client, nil
}
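// A sketch of how peerGroup behaves from a caller's point of view. The address is
// illustrative, real dial options (e.g. transport credentials) come from
// Options.DialOpts, and the dialer field above can be swapped out in tests for a
// stub that returns an in-memory connection:
//
//	pg := newPeerGroup()
//	cl1, _ := pg.get(context.Background(), "10.0.0.1:10901") // dials the peer and caches the client
//	cl2, _ := pg.get(context.Background(), "10.0.0.1:10901") // returns the cached client; cl1 == cl2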