github.com/grafana/pyroscope@v1.18.0/pkg/segmentwriter/segment.go (about)

     1  package segmentwriter
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"crypto/rand"
     7  	"fmt"
     8  	"io"
     9  	"math"
    10  	"os"
    11  	"runtime"
    12  	"slices"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	"github.com/go-kit/log"
    18  	"github.com/go-kit/log/level"
    19  	"github.com/google/uuid"
    20  	"github.com/grafana/dskit/backoff"
    21  	"github.com/oklog/ulid/v2"
    22  	"github.com/opentracing/opentracing-go"
    23  	"github.com/thanos-io/objstore"
    24  	"golang.org/x/exp/maps"
    25  	"golang.org/x/time/rate"
    26  
    27  	profilev1 "github.com/grafana/pyroscope/api/gen/proto/go/google/v1"
    28  	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
    29  	typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1"
    30  	"github.com/grafana/pyroscope/pkg/block"
    31  	"github.com/grafana/pyroscope/pkg/block/metadata"
    32  	"github.com/grafana/pyroscope/pkg/model"
    33  	"github.com/grafana/pyroscope/pkg/model/pprofsplit"
    34  	pprofmodel "github.com/grafana/pyroscope/pkg/pprof"
    35  	"github.com/grafana/pyroscope/pkg/segmentwriter/memdb"
    36  	"github.com/grafana/pyroscope/pkg/util/retry"
    37  )
    38  
// shardKey identifies a segment-writer shard; profiles are routed to a
// shard by this key (see segmentsWriter.ingest).
type shardKey uint32
    40  
// segmentsWriter owns all shards and the resources they share to build
// and upload segment blocks: the object-storage bucket, the metastore
// client, the flush worker pool, and the hedged-upload rate limiter.
type segmentsWriter struct {
	config    Config
	limits    Limits
	logger    log.Logger
	bucket    objstore.Bucket
	metastore metastorev1.IndexServiceClient

	// shards are created lazily on first ingest; shardsLock guards the map.
	shards     map[shardKey]*shard
	shardsLock sync.RWMutex
	// pool executes dataset head flushes concurrently (see flushHeads).
	pool workerPool

	// ctx is cancelled by stop() to terminate all shard loops.
	ctx    context.Context
	cancel context.CancelFunc

	metrics     *segmentMetrics
	headMetrics *memdb.HeadMetrics
	// hedgedUploadLimiter caps the rate of hedged (second) upload attempts.
	hedgedUploadLimiter *rate.Limiter
}
    59  
// shard accumulates ingested profiles into its current segment and swaps
// in a fresh segment on every flush tick. mu guards the segment pointer:
// ingestion takes the read lock, flushSegment swaps under the write lock.
type shard struct {
	// wg tracks the shard's loop goroutine; stop() waits on it.
	wg        sync.WaitGroup
	logger    log.Logger
	concatBuf []byte
	sw        *segmentsWriter
	mu        sync.RWMutex
	segment   *segment
}
    68  
// ingest runs fn against the shard's current segment and returns that
// segment so the caller can wait for it to be flushed. The in-flight
// counter is incremented while holding the read lock, which guarantees
// that flushSegment (which swaps the segment under the write lock) can
// wait for all in-flight ingestions into the old segment to drain.
func (sh *shard) ingest(fn func(head segmentIngest)) segmentWaitFlushed {
	sh.mu.RLock()
	s := sh.segment
	s.inFlightProfiles.Add(1)
	sh.mu.RUnlock()
	defer s.inFlightProfiles.Done()
	fn(s)
	return s
}
    78  
// loop flushes the shard's segment every SegmentDuration until ctx is
// cancelled, performing one final flush on shutdown.
func (sh *shard) loop(ctx context.Context) {
	loopWG := new(sync.WaitGroup)
	ticker := time.NewTicker(sh.sw.config.SegmentDuration)
	defer func() {
		ticker.Stop()
		// Blocking here to make sure no asynchronous code is executed on this shard once loop exits
		// This is mostly needed to fix a race in our integration tests
		loopWG.Wait()
	}()
	for {
		select {
		case <-ticker.C:
			sh.flushSegment(context.Background(), loopWG)
		case <-ctx.Done():
			// Final flush uses a fresh context: the loop's own context
			// is already cancelled at this point.
			sh.flushSegment(context.Background(), loopWG)
			return
		}
	}
}
    98  
// flushSegment atomically replaces the shard's current segment with a
// fresh one, then flushes the old segment on a separate goroutine so a
// slow metastore or object store does not delay subsequent ticks. The
// flush goroutine is tracked by wg, which is owned by the shard loop.
func (sh *shard) flushSegment(ctx context.Context, wg *sync.WaitGroup) {
	sh.mu.Lock()
	s := sh.segment
	sh.segment = sh.sw.newSegment(sh, s.shard, sh.logger)
	sh.mu.Unlock()

	wg.Add(1)
	go func() { // not blocking next ticks in case metastore/s3 latency is high
		defer wg.Done()
		t1 := time.Now()
		// Wait until all in-flight ingestions into the old segment finish.
		s.inFlightProfiles.Wait()
		s.debuginfo.waitInflight = time.Since(t1)

		err := s.flush(ctx)
		if err != nil {
			_ = level.Error(sh.sw.logger).Log("msg", "failed to flush segment", "err", err)
		}
		if s.debuginfo.movedHeads > 0 {
			_ = level.Debug(s.logger).Log("msg",
				"writing segment block done",
				"heads-count", len(s.datasets),
				"heads-moved-count", s.debuginfo.movedHeads,
				"inflight-duration", s.debuginfo.waitInflight,
				"flush-heads-duration", s.debuginfo.flushHeadsDuration,
				"flush-block-duration", s.debuginfo.flushBlockDuration,
				"store-meta-duration", s.debuginfo.storeMetaDuration,
				"total-duration", time.Since(t1))
		}
	}()
}
   129  
   130  func newSegmentWriter(l log.Logger, metrics *segmentMetrics, hm *memdb.HeadMetrics, config Config, limits Limits, bucket objstore.Bucket, metastoreClient metastorev1.IndexServiceClient) *segmentsWriter {
   131  	sw := &segmentsWriter{
   132  		limits:      limits,
   133  		metrics:     metrics,
   134  		headMetrics: hm,
   135  		config:      config,
   136  		logger:      l,
   137  		bucket:      bucket,
   138  		shards:      make(map[shardKey]*shard),
   139  		metastore:   metastoreClient,
   140  	}
   141  	sw.hedgedUploadLimiter = rate.NewLimiter(rate.Limit(sw.config.UploadHedgeRateMax), int(sw.config.UploadHedgeRateBurst))
   142  	sw.ctx, sw.cancel = context.WithCancel(context.Background())
   143  	flushWorkers := runtime.GOMAXPROCS(-1)
   144  	if config.FlushConcurrency > 0 {
   145  		flushWorkers = int(config.FlushConcurrency)
   146  	}
   147  	sw.pool.run(max(minFlushConcurrency, flushWorkers))
   148  	return sw
   149  }
   150  
   151  func (sw *segmentsWriter) ingest(shard shardKey, fn func(head segmentIngest)) (await segmentWaitFlushed) {
   152  	sw.shardsLock.RLock()
   153  	s, ok := sw.shards[shard]
   154  	sw.shardsLock.RUnlock()
   155  	if ok {
   156  		return s.ingest(fn)
   157  	}
   158  
   159  	sw.shardsLock.Lock()
   160  	s, ok = sw.shards[shard]
   161  	if ok {
   162  		sw.shardsLock.Unlock()
   163  		return s.ingest(fn)
   164  	}
   165  
   166  	s = sw.newShard(shard)
   167  	sw.shards[shard] = s
   168  	sw.shardsLock.Unlock()
   169  	return s.ingest(fn)
   170  }
   171  
   172  func (sw *segmentsWriter) stop() {
   173  	sw.logger.Log("msg", "stopping segments writer")
   174  	sw.cancel()
   175  	sw.shardsLock.Lock()
   176  	defer sw.shardsLock.Unlock()
   177  	for _, s := range sw.shards {
   178  		s.wg.Wait()
   179  	}
   180  	sw.pool.stop()
   181  	sw.logger.Log("msg", "segments writer stopped")
   182  }
   183  
   184  func (sw *segmentsWriter) newShard(sk shardKey) *shard {
   185  	sl := log.With(sw.logger, "shard", fmt.Sprintf("%d", sk))
   186  	sh := &shard{
   187  		sw:        sw,
   188  		logger:    sl,
   189  		concatBuf: make([]byte, 4*0x1000),
   190  	}
   191  	sh.segment = sw.newSegment(sh, sk, sl)
   192  	sh.wg.Add(1)
   193  	go func() {
   194  		defer sh.wg.Done()
   195  		sh.loop(sw.ctx)
   196  	}()
   197  	return sh
   198  }
   199  
   200  func (sw *segmentsWriter) newSegment(sh *shard, sk shardKey, sl log.Logger) *segment {
   201  	id := ulid.MustNew(ulid.Timestamp(time.Now()), rand.Reader)
   202  	sshard := fmt.Sprintf("%d", sk)
   203  	s := &segment{
   204  		logger:   log.With(sl, "segment-id", id.String()),
   205  		ulid:     id,
   206  		datasets: make(map[datasetKey]*dataset),
   207  		sw:       sw,
   208  		sh:       sh,
   209  		shard:    sk,
   210  		sshard:   sshard,
   211  		doneChan: make(chan struct{}),
   212  	}
   213  	return s
   214  }
   215  
// flush builds a block from the segment's dataset heads, uploads it to
// object storage, and stores the block metadata in the metastore.
// doneChan is always closed on return, releasing waitFlushed callers;
// a non-nil error is recorded in flushErr for them beforehand.
func (s *segment) flush(ctx context.Context) (err error) {
	span, ctx := opentracing.StartSpanFromContext(ctx, "segment.flush", opentracing.Tags{
		"block_id": s.ulid.String(),
		"datasets": len(s.datasets),
		"shard":    s.shard,
	})
	defer span.Finish()

	t1 := time.Now()
	defer func() {
		if err != nil {
			s.flushErrMutex.Lock()
			s.flushErr = err
			s.flushErrMutex.Unlock()
		}
		close(s.doneChan)
		s.sw.metrics.flushSegmentDuration.WithLabelValues(s.sshard).Observe(time.Since(t1).Seconds())
	}()

	// Flush all dataset heads concurrently; the stream yields results in
	// deterministic order.
	stream := s.flushHeads(ctx)
	s.debuginfo.movedHeads = len(stream.heads)
	if len(stream.heads) == 0 {
		// Nothing was ingested into this segment: there is no block to write.
		return nil
	}

	// TODO(kolesnikovae): Use buffer pool for blockData.
	blockData, blockMeta, err := s.flushBlock(stream)
	if err != nil {
		return fmt.Errorf("failed to flush block %s: %w", s.ulid.String(), err)
	}
	if err = s.sw.uploadBlock(ctx, blockData, blockMeta, s); err != nil {
		return fmt.Errorf("failed to upload block %s: %w", s.ulid.String(), err)
	}
	if err = s.sw.storeMetadata(ctx, blockMeta, s); err != nil {
		return fmt.Errorf("failed to store meta %s: %w", s.ulid.String(), err)
	}

	return nil
}
   255  
// flushBlock concatenates the flushed dataset heads into a single
// in-memory block and assembles its metadata: the per-dataset table of
// contents, the block time range, and the shared string table. The
// encoded metadata is appended to the block itself at MetadataOffset.
func (s *segment) flushBlock(stream flushStream) ([]byte, *metastorev1.BlockMeta, error) {
	start := time.Now()
	hostname, _ := os.Hostname() // best effort: empty string on error

	stringTable := metadata.NewStringTable()
	meta := &metastorev1.BlockMeta{
		FormatVersion:   1,
		Id:              s.ulid.String(),
		Tenant:          0,
		Shard:           uint32(s.shard),
		CompactionLevel: 0,
		CreatedBy:       stringTable.Put(hostname),
		MinTime:         math.MaxInt64,
		MaxTime:         0,
		Size:            0,
		Datasets:        make([]*metastorev1.Dataset, 0, len(stream.heads)),
	}

	blockFile := bytes.NewBuffer(nil)

	// w tracks the byte offset within the block so dataset sections and
	// the metadata entry can be located later.
	w := &writerOffset{Writer: blockFile}
	for stream.Next() {
		f := stream.At()
		// TODO(kolesnikovae): Build dataset index for the tenant.
		//   Note that the heads are flushed concurrently, so we cannot build
		//   during head flush. I'd prefer to delegate it to the head itself:
		//      WriteDatasetIndex(w *memindex.Writer, idx uint32)
		//   Tenant datasets follow sequentially; when all tenant datasets
		//   are flushed, we can build the index and create a metadata
		//   entry for it.
		ds := concatSegmentHead(f, w, stringTable)
		meta.MinTime = min(meta.MinTime, ds.MinTime)
		meta.MaxTime = max(meta.MaxTime, ds.MaxTime)
		meta.Datasets = append(meta.Datasets, ds)
		s.sw.metrics.headSizeBytes.WithLabelValues(s.sshard, f.dataset.key.tenant).Observe(float64(ds.Size))
	}

	// The metadata entry is written at the end of the block; its offset
	// and the final block size are recorded in the metadata itself.
	meta.StringTable = stringTable.Strings
	meta.MetadataOffset = uint64(w.offset)
	if err := metadata.Encode(w, meta); err != nil {
		return nil, nil, fmt.Errorf("failed to encode metadata: %w", err)
	}
	meta.Size = uint64(w.offset)
	s.debuginfo.flushBlockDuration = time.Since(start)
	return blockFile.Bytes(), meta, nil
}
   302  
   303  type writerOffset struct {
   304  	io.Writer
   305  	offset int64
   306  }
   307  
   308  func (w *writerOffset) Write(p []byte) (n int, err error) {
   309  	n, err = w.Writer.Write(p)
   310  	w.offset += int64(n)
   311  	return n, err
   312  }
   313  
   314  func concatSegmentHead(f *datasetFlush, w *writerOffset, s *metadata.StringTable) *metastorev1.Dataset {
   315  	tenantServiceOffset := w.offset
   316  
   317  	ptypes := f.flushed.Meta.ProfileTypeNames
   318  
   319  	offsets := []uint64{0, 0, 0}
   320  
   321  	offsets[0] = uint64(w.offset)
   322  	_, _ = w.Write(f.flushed.Profiles)
   323  
   324  	offsets[1] = uint64(w.offset)
   325  	_, _ = w.Write(f.flushed.Index)
   326  
   327  	offsets[2] = uint64(w.offset)
   328  	_, _ = w.Write(f.flushed.Symbols)
   329  
   330  	tenantServiceSize := w.offset - tenantServiceOffset
   331  
   332  	ds := &metastorev1.Dataset{
   333  		Tenant:  s.Put(f.dataset.key.tenant),
   334  		Name:    s.Put(f.dataset.key.service),
   335  		MinTime: f.flushed.Meta.MinTimeNanos / 1e6,
   336  		MaxTime: f.flushed.Meta.MaxTimeNanos / 1e6,
   337  		Size:    uint64(tenantServiceSize),
   338  		//  - 0: profiles.parquet
   339  		//  - 1: index.tsdb
   340  		//  - 2: symbols.symdb
   341  		TableOfContents: offsets,
   342  		Labels:          nil,
   343  	}
   344  
   345  	lb := metadata.NewLabelBuilder(s)
   346  	for _, profileType := range ptypes {
   347  		lb.WithLabelSet(model.LabelNameServiceName, f.dataset.key.service, model.LabelNameProfileType, profileType)
   348  	}
   349  
   350  	if f.flushed.Unsymbolized {
   351  		lb.WithLabelSet(model.LabelNameServiceName, f.dataset.key.service, metadata.LabelNameUnsymbolized, "true")
   352  	}
   353  
   354  	// Other optional labels:
   355  	// lb.WithLabelSet("label_name", "label_value", ...)
   356  	ds.Labels = lb.Build()
   357  
   358  	return ds
   359  }
   360  
// flushHeads schedules a flush of every dataset head on the shared worker
// pool and returns a stream that yields the results in deterministic
// (tenant, service) key order. Heads that fail to flush or contain no
// samples are left with a nil flushed field and skipped by the stream.
func (s *segment) flushHeads(ctx context.Context) flushStream {
	heads := maps.Values(s.datasets)
	slices.SortFunc(heads, func(a, b *dataset) int {
		return a.key.compare(b.key)
	})

	stream := make([]*datasetFlush, len(heads))
	for i := range heads {
		f := &datasetFlush{
			dataset: heads[i],
			done:    make(chan struct{}),
		}
		stream[i] = f
		s.sw.pool.do(func() {
			// done is closed in every outcome; a nil f.flushed marks the
			// dataset as skipped.
			defer close(f.done)
			flushed, err := s.flushDataset(ctx, f.dataset)
			if err != nil {
				level.Error(s.logger).Log("msg", "failed to flush head", "err", err)
				return
			}
			if flushed == nil {
				level.Debug(s.logger).Log("msg", "skipping nil head")
				return
			}
			if flushed.Meta.NumSamples == 0 {
				level.Debug(s.logger).Log("msg", "skipping empty head")
				return
			}
			f.flushed = flushed
		})
	}

	return flushStream{heads: stream}
}
   395  
// flushStream iterates over pending dataset flushes in submission order,
// blocking on each until its result is available.
type flushStream struct {
	heads []*datasetFlush
	// cur is the element returned by At; n is the index of the next one.
	cur *datasetFlush
	n   int
}
   401  
// At returns the flush the stream is currently positioned at (valid only
// after Next returned true).
func (s *flushStream) At() *datasetFlush { return s.cur }
   403  
   404  func (s *flushStream) Next() bool {
   405  	for s.n < len(s.heads) {
   406  		f := s.heads[s.n]
   407  		s.n++
   408  		<-f.done
   409  		if f.flushed != nil {
   410  			s.cur = f
   411  			return true
   412  		}
   413  	}
   414  	return false
   415  }
   416  
   417  func (s *segment) flushDataset(ctx context.Context, e *dataset) (*memdb.FlushedHead, error) {
   418  	th := time.Now()
   419  	flushed, err := e.head.Flush(ctx)
   420  	if err != nil {
   421  		s.sw.metrics.flushServiceHeadDuration.WithLabelValues(s.sshard, e.key.tenant).Observe(time.Since(th).Seconds())
   422  		s.sw.metrics.flushServiceHeadError.WithLabelValues(s.sshard, e.key.tenant).Inc()
   423  		return nil, fmt.Errorf("failed to flush head : %w", err)
   424  	}
   425  	s.sw.metrics.flushServiceHeadDuration.WithLabelValues(s.sshard, e.key.tenant).Observe(time.Since(th).Seconds())
   426  	level.Debug(s.logger).Log(
   427  		"msg", "flushed head",
   428  		"tenant", e.key.tenant,
   429  		"service", e.key.service,
   430  		"profiles", flushed.Meta.NumProfiles,
   431  		"profiletypes", fmt.Sprintf("%v", flushed.Meta.ProfileTypeNames),
   432  		"mintime", flushed.Meta.MinTimeNanos,
   433  		"maxtime", flushed.Meta.MaxTimeNanos,
   434  		"head-flush-duration", time.Since(th).String(),
   435  	)
   436  	return flushed, nil
   437  }
   438  
   439  type datasetKey struct {
   440  	tenant  string
   441  	service string
   442  }
   443  
   444  func (k datasetKey) compare(x datasetKey) int {
   445  	if k.tenant != x.tenant {
   446  		return strings.Compare(k.tenant, x.tenant)
   447  	}
   448  	return strings.Compare(k.service, x.service)
   449  }
   450  
// dataset accumulates profiles of a single (tenant, service) pair.
// The in-memory head is created lazily on first use (see initHead).
type dataset struct {
	key  datasetKey
	sw   *segmentsWriter
	once sync.Once
	head *memdb.Head
}
   457  
// newDataset returns a dataset for the given key; the head is not
// allocated until initHead is called.
func newDataset(k datasetKey, sw *segmentsWriter) *dataset { return &dataset{key: k, sw: sw} }
   459  
// initHead lazily creates the in-memory head exactly once (safe for
// concurrent callers) and returns it.
func (d *dataset) initHead() *memdb.Head {
	d.once.Do(func() {
		d.head = memdb.NewHead(d.sw.headMetrics)
	})
	return d.head
}
   466  
// datasetFlush tracks the asynchronous flush of one dataset head.
// done is closed when the flush attempt completes; flushed remains nil
// if the flush failed or produced no samples.
type datasetFlush struct {
	dataset *dataset
	flushed *memdb.FlushedHead
	done    chan struct{}
}
   472  
// segment is a single in-progress block for one shard: profiles ingested
// during a flush interval are grouped into per-(tenant, service) datasets
// and written out together by flush.
type segment struct {
	ulid  ulid.ULID
	shard shardKey
	// sshard is the shard key pre-formatted for metric labels.
	sshard string
	// inFlightProfiles counts ingestions still running against this
	// segment; flush waits for it to drain (see shard.ingest).
	inFlightProfiles sync.WaitGroup

	datasetsLock sync.Mutex
	datasets     map[datasetKey]*dataset

	logger log.Logger
	sw     *segmentsWriter

	// TODO(kolesnikovae): Revisit.
	// doneChan is closed when flush completes; flushErr holds the flush
	// outcome for waitFlushed callers.
	doneChan      chan struct{}
	flushErr      error
	flushErrMutex sync.Mutex

	// debuginfo collects timings reported in the flush debug log line.
	debuginfo struct {
		movedHeads         int
		waitInflight       time.Duration
		flushHeadsDuration time.Duration
		flushBlockDuration time.Duration
		storeMetaDuration  time.Duration
	}

	// TODO(kolesnikovae): Naming.
	sh *shard
}
   501  
// segmentIngest accepts a single profile for ingestion into a segment.
type segmentIngest interface {
	ingest(tenantID string, p *profilev1.Profile, id uuid.UUID, labels []*typesv1.LabelPair, annotations []*typesv1.ProfileAnnotation)
}
   505  
// segmentWaitFlushed blocks until the segment has been flushed and
// returns the flush error, if any.
type segmentWaitFlushed interface {
	waitFlushed(ctx context.Context) error
}
   509  
   510  func (s *segment) waitFlushed(ctx context.Context) error {
   511  	select {
   512  	case <-ctx.Done():
   513  		return fmt.Errorf("waitFlushed: %s %w", s.ulid.String(), ctx.Err())
   514  	case <-s.doneChan:
   515  		s.flushErrMutex.Lock()
   516  		res := s.flushErr
   517  		s.flushErrMutex.Unlock()
   518  		return res
   519  	}
   520  }
   521  
// ingest splits the profile into sample series and appends them to the
// head of the dataset identified by the tenant and the profile's
// service_name label.
func (s *segment) ingest(tenantID string, p *profilev1.Profile, id uuid.UUID, labels []*typesv1.LabelPair, annotations []*typesv1.ProfileAnnotation) {
	// TODO(kolesnikovae): Refactor: profile split should be moved inside the
	//   dataset.Ingest: we want to do it together with / instead of creation
	//   of the internal representation (InMemoryProfile).
	//   symbols.WriteProfileSymbols should be replaced with something more
	//   suitable (see comment) – we want to avoid allocating intermediate
	//   objects that are used only temporarily.
	//   Many sample series refer to same symbols, so we can avoid extra
	//   processing and index symbols just once: at this point we know that
	//   all samples are to be stored, and all the referred symbols need to
	//   be indexed. This will require quite a bit of refactoring, but it's
	//   worth it.
	serviceName := model.Labels(labels).Get(model.LabelNameServiceName)
	ds := s.datasetForIngest(datasetKey{tenant: tenantID, service: serviceName})
	appender := &sampleAppender{dataset: ds.initHead(), profile: p, id: id, annotations: annotations}
	// Relabeling rules cannot be applied here: it should be done before the
	// ingestion, in distributors. Otherwise, it may change the distribution
	// key, including the "service_name" label, which we use to determine the
	// profile target dataset.
	// TODO: Replace with pprof.GroupSamples
	_ = pprofsplit.VisitSampleSeries(p, labels, nil, appender)
	s.sw.metrics.segmentIngestBytes.WithLabelValues(s.sshard, tenantID).Observe(float64(p.SizeVT()))
}
   545  
// sampleAppender receives the output of the profile split
// (pprofsplit.VisitSampleSeries) and ingests it into a dataset head.
type sampleAppender struct {
	id      uuid.UUID
	dataset *memdb.Head
	profile *profilev1.Profile
	// exporter is created lazily, only when individual sample series
	// need to be re-exported as standalone profiles.
	exporter    *pprofmodel.SampleExporter
	annotations []*typesv1.ProfileAnnotation
}
   553  
// VisitProfile ingests the whole profile under the given label set.
func (v *sampleAppender) VisitProfile(labels model.Labels) {
	v.dataset.Ingest(v.profile, v.id, labels, v.annotations)
}
   557  
// VisitSampleSeries exports the given subset of samples into a fresh
// profile and ingests it under the given label set.
func (v *sampleAppender) VisitSampleSeries(labels model.Labels, samples []*profilev1.Sample) {
	if v.exporter == nil {
		v.exporter = pprofmodel.NewSampleExporter(v.profile)
	}
	var n profilev1.Profile
	v.exporter.ExportSamples(&n, samples)
	v.dataset.Ingest(&n, v.id, labels, v.annotations)
}
   566  
// ValidateLabels performs no validation and returns the labels as-is.
func (v *sampleAppender) ValidateLabels(labels model.Labels) (model.Labels, error) {
	return labels, nil
}
   570  
// Discarded is a no-op: discarded series and samples are not tracked here.
func (v *sampleAppender) Discarded(_, _ int) {}
   572  
   573  func (s *segment) datasetForIngest(k datasetKey) *dataset {
   574  	s.datasetsLock.Lock()
   575  	ds, ok := s.datasets[k]
   576  	if !ok {
   577  		ds = newDataset(k, s.sw)
   578  		s.datasets[k] = ds
   579  	}
   580  	s.datasetsLock.Unlock()
   581  	return ds
   582  }
   583  
// uploadBlock uploads the serialized block to object storage. Failed
// attempts are retried with backoff, and a hedged (second, concurrent)
// attempt is launched if the first does not complete within
// UploadHedgeAfter; hedged attempts are rate-limited and not retried.
func (sw *segmentsWriter) uploadBlock(ctx context.Context, blockData []byte, meta *metastorev1.BlockMeta, s *segment) error {
	uploadStart := time.Now()
	var err error
	defer func() {
		sw.metrics.segmentUploadDuration.
			WithLabelValues(statusLabelValue(err)).
			Observe(time.Since(uploadStart).Seconds())
	}()

	path := block.ObjectPath(meta)
	sw.metrics.segmentSizeBytes.
		WithLabelValues(s.sshard).
		Observe(float64(len(blockData)))

	// Cap the overall upload (including hedging and retries) if configured.
	if sw.config.UploadTimeout > 0 {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, sw.config.UploadTimeout)
		defer cancel()
	}

	// To mitigate tail latency issues, we use a hedged upload strategy:
	// if the request is not completed within a certain time, we trigger
	// a second upload attempt. Upload errors are retried explicitly and
	// are included into the call duration.
	hedgedUpload := retry.Hedged[any]{
		Trigger: time.After(sw.config.UploadHedgeAfter),
		Call: func(ctx context.Context, hedge bool) (any, error) {
			retryConfig := backoff.Config{
				MinBackoff: sw.config.UploadMinBackoff,
				MaxBackoff: sw.config.UploadMaxBackoff,
				MaxRetries: sw.config.UploadMaxRetries,
			}
			var attemptErr error
			if hedge {
				// Respect the global hedged-upload rate limit before
				// issuing the second attempt.
				if limitErr := sw.hedgedUploadLimiter.Wait(ctx); limitErr != nil {
					return nil, limitErr
				}
				// Hedged requests are not retried.
				retryConfig.MaxRetries = 0
				attemptStart := time.Now()
				defer func() {
					sw.metrics.segmentHedgedUploadDuration.
						WithLabelValues(statusLabelValue(attemptErr)).
						Observe(time.Since(attemptStart).Seconds())
				}()
			}
			// Retry on all errors.
			retries := backoff.New(ctx, retryConfig)
			for retries.Ongoing() {
				if attemptErr = sw.bucket.Upload(ctx, path, bytes.NewReader(blockData)); attemptErr == nil {
					break
				}
				retries.Wait()
			}
			return nil, attemptErr
		},
	}

	if _, err = hedgedUpload.Do(ctx); err != nil {
		return err
	}

	level.Debug(sw.logger).Log("msg", "uploaded block", "path", path, "upload_duration", time.Since(uploadStart))
	return nil
}
   649  
// storeMetadata registers the block with the metastore. If the call fails
// and the DLQ is enabled, the metadata is uploaded to the bucket's
// dead-letter-queue location as a fallback.
func (sw *segmentsWriter) storeMetadata(ctx context.Context, meta *metastorev1.BlockMeta, s *segment) error {
	start := time.Now()
	var err error
	defer func() {
		sw.metrics.storeMetadataDuration.
			WithLabelValues(statusLabelValue(err)).
			Observe(time.Since(start).Seconds())
		s.debuginfo.storeMetaDuration = time.Since(start)
	}()

	// The metastore call gets its own timeout; the DLQ fallback below
	// deliberately uses the original ctx.
	mdCtx := ctx
	if sw.config.MetadataUpdateTimeout > 0 {
		var cancel context.CancelFunc
		mdCtx, cancel = context.WithTimeout(mdCtx, sw.config.MetadataUpdateTimeout)
		defer cancel()
	}

	if _, err = sw.metastore.AddBlock(mdCtx, &metastorev1.AddBlockRequest{Block: meta}); err == nil {
		return nil
	}

	level.Error(s.logger).Log("msg", "failed to store meta in metastore", "err", err)
	if !sw.config.MetadataDLQEnabled {
		return err
	}

	defer func() {
		sw.metrics.storeMetadataDLQ.WithLabelValues(statusLabelValue(err)).Inc()
	}()

	if err = s.sw.storeMetadataDLQ(ctx, meta); err == nil {
		level.Debug(s.logger).Log("msg", "successfully wrote block metadata to DLQ", "block_id", meta.Id)
		return nil
	}

	level.Error(s.logger).Log("msg", "metastore fallback failed", "err", err)
	return err
}
   688  
// storeMetadataDLQ uploads the serialized block metadata to the
// dead-letter-queue location in the bucket; used as a fallback when the
// metastore is unavailable (see storeMetadata).
func (sw *segmentsWriter) storeMetadataDLQ(ctx context.Context, meta *metastorev1.BlockMeta) error {
	metadataBytes, err := meta.MarshalVT()
	if err != nil {
		return err
	}
	return sw.bucket.Upload(ctx, block.MetadataDLQObjectPath(meta), bytes.NewReader(metadataBytes))
}
   696  
   697  type workerPool struct {
   698  	workers sync.WaitGroup
   699  	jobs    chan func()
   700  }
   701  
   702  func (p *workerPool) run(n int) {
   703  	if p.jobs != nil {
   704  		return
   705  	}
   706  	p.jobs = make(chan func())
   707  	p.workers.Add(n)
   708  	for i := 0; i < n; i++ {
   709  		go func() {
   710  			defer p.workers.Done()
   711  			for job := range p.jobs {
   712  				job()
   713  			}
   714  		}()
   715  	}
   716  }
   717  
   718  // do must not be called after stop.
   719  func (p *workerPool) do(job func()) {
   720  	p.jobs <- job
   721  }
   722  
   723  func (p *workerPool) stop() {
   724  	close(p.jobs)
   725  	p.workers.Wait()
   726  }