github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/tools/blocksconvert/builder/builder.go

package builder

import (
	"context"
	"flag"
	"io/ioutil"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/prometheus/pkg/labels"
	"github.com/thanos-io/thanos/pkg/block"
	"github.com/thanos-io/thanos/pkg/block/metadata"
	"github.com/thanos-io/thanos/pkg/objstore"
	"golang.org/x/sync/errgroup"

	"github.com/cortexproject/cortex/pkg/chunk"
	"github.com/cortexproject/cortex/pkg/chunk/cache"
	"github.com/cortexproject/cortex/pkg/chunk/storage"
	"github.com/cortexproject/cortex/pkg/storage/bucket"
	cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb"
	"github.com/cortexproject/cortex/tools/blocksconvert"
	"github.com/cortexproject/cortex/tools/blocksconvert/planprocessor"
)

// How many series are kept in memory before sorting and writing them to a file.
const defaultSeriesBatchSize = 250000

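// Config holds the builder-specific options (local output directory, series
// processor concurrency, block upload and local cleanup behaviour), together
// with the embedded chunks-cache and plan-processor configuration.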
type Config struct {
	OutputDirectory string
	Concurrency     int

	ChunkCacheConfig   cache.Config
	UploadBlock        bool
	DeleteLocalBlock   bool
	SeriesBatchSize    int
	TimestampTolerance time.Duration

	PlanProcessorConfig planprocessor.Config
}

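// RegisterFlags registers the builder flags (prefixed with "builder."), plus
// the chunks-cache and plan-processor flags, on the given FlagSet.
//
// A minimal sketch of standalone usage; the FlagSet name and flag values are
// illustrative only, not taken from this repository:
//
//	fs := flag.NewFlagSet("blocksconvert-builder", flag.ExitOnError)
//	var cfg Config
//	cfg.RegisterFlags(fs)
//	_ = fs.Parse([]string{"-builder.output-dir=/tmp/blocks", "-builder.concurrency=64"})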
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	cfg.ChunkCacheConfig.RegisterFlagsWithPrefix("chunks.", "Chunks cache", f)
	cfg.PlanProcessorConfig.RegisterFlags("builder", f)

	f.StringVar(&cfg.OutputDirectory, "builder.output-dir", "", "Local directory used for storing temporary plan files (will be created if missing).")
	f.IntVar(&cfg.Concurrency, "builder.concurrency", 128, "Number of concurrent series processors.")
	f.BoolVar(&cfg.UploadBlock, "builder.upload", true, "Upload generated blocks to storage.")
	f.BoolVar(&cfg.DeleteLocalBlock, "builder.delete-local-blocks", true, "Delete local files after uploading the block.")
	f.IntVar(&cfg.SeriesBatchSize, "builder.series-batch-size", defaultSeriesBatchSize, "Number of series to keep in memory before batch-writing to a temp file. Lower this to decrease memory usage during block building.")
	f.DurationVar(&cfg.TimestampTolerance, "builder.timestamp-tolerance", 0, "Adjust sample timestamps by up to this amount to align them to an exact number of seconds apart.")
}

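// NewBuilder creates the builder service. It loads the chunk schema, creates
// the bucket client and the local output directory, registers the conversion
// metrics, and returns a plan-processor service that drives the Builder, one
// plan file at a time.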
func NewBuilder(cfg Config, scfg blocksconvert.SharedConfig, l log.Logger, reg prometheus.Registerer) (services.Service, error) {
	err := scfg.SchemaConfig.Load()
	if err != nil {
		return nil, errors.Wrap(err, "failed to load schema")
	}

	bucketClient, err := scfg.GetBucket(l, reg)
	if err != nil {
		return nil, err
	}

	if cfg.OutputDirectory == "" {
		return nil, errors.New("no output directory")
	}
	if err := os.MkdirAll(cfg.OutputDirectory, os.FileMode(0700)); err != nil {
		return nil, errors.Wrap(err, "failed to create output directory")
	}

	b := &Builder{
		cfg: cfg,

		bucketClient:  bucketClient,
		schemaConfig:  scfg.SchemaConfig,
		storageConfig: scfg.StorageConfig,

		fetchedChunks: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_fetched_chunks_total",
			Help: "Fetched chunks",
		}),
		fetchedChunksSize: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_fetched_chunks_bytes_total",
			Help: "Fetched chunks bytes",
		}),
		processedSeries: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_series_total",
			Help: "Processed series",
		}),
		writtenSamples: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_written_samples_total",
			Help: "Written samples",
		}),
		buildInProgress: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_builder_in_progress",
			Help: "Build in progress",
		}),
		chunksNotFound: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_chunks_not_found_total",
			Help: "Number of chunks that were not found on the storage.",
		}),
		blocksSize: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_builder_block_size_bytes_total",
			Help: "Total size of blocks generated by this builder.",
		}),
		seriesInMemory: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "cortex_blocksconvert_builder_series_in_memory",
			Help: "Number of series kept in memory at the moment. (Builder writes series to temp files in order to reduce memory usage.)",
		}),
	}

	return planprocessor.NewService(cfg.PlanProcessorConfig, filepath.Join(cfg.OutputDirectory, "plans"), bucketClient, b.cleanupFn, b.planProcessorFactory, l, reg)
}

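// Builder holds the state shared by all plan processors: the target bucket,
// the chunk schema and storage configuration, and the conversion metrics.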
type Builder struct {
	cfg Config

	bucketClient  objstore.Bucket
	schemaConfig  chunk.SchemaConfig
	storageConfig storage.Config

	fetchedChunks     prometheus.Counter
	fetchedChunksSize prometheus.Counter
	processedSeries   prometheus.Counter
	writtenSamples    prometheus.Counter
	blocksSize        prometheus.Counter

	buildInProgress prometheus.Gauge
	chunksNotFound  prometheus.Counter
	seriesInMemory  prometheus.Gauge
}

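// cleanupFn is passed to the plan-processor service as its cleanup callback.
// It removes leftover ".tmp" block directories from the output directory,
// i.e. blocks whose build was interrupted before being finished.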
func (b *Builder) cleanupFn(log log.Logger) error {
	files, err := ioutil.ReadDir(b.cfg.OutputDirectory)
	if err != nil {
		return err
	}

	// Delete directories with .tmp suffix (unfinished blocks).
	for _, f := range files {
		if strings.HasSuffix(f.Name(), ".tmp") && f.IsDir() {
			toRemove := filepath.Join(b.cfg.OutputDirectory, f.Name())

			level.Info(log).Log("msg", "deleting unfinished block", "dir", toRemove)

			err := os.RemoveAll(toRemove)
			if err != nil {
				return errors.Wrapf(err, "removing %s", toRemove)
			}
		}
	}

	return nil
}

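// planProcessorFactory returns a PlanProcessor that converts the plan entries
// for the given tenant and day range into a single TSDB block.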
func (b *Builder) planProcessorFactory(planLog log.Logger, userID string, start time.Time, end time.Time) planprocessor.PlanProcessor {
	return &builderProcessor{
		builder:  b,
		log:      planLog,
		userID:   userID,
		dayStart: start,
		dayEnd:   end,
	}
}

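// builderProcessor builds one TSDB block from the plan entries of a single
// tenant and day range.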
type builderProcessor struct {
	builder *Builder

	log      log.Logger
	userID   string
	dayStart time.Time
	dayEnd   time.Time
}

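// ProcessPlanEntries reads plan entries from planEntryCh, fetches the
// referenced chunks using cfg.Concurrency workers, and appends the series to a
// TSDB block. Once the channel is closed, it finishes the block, optionally
// uploads it to the tenant's bucket (deleting the local copy if configured),
// and returns the block ULID.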
func (p *builderProcessor) ProcessPlanEntries(ctx context.Context, planEntryCh chan blocksconvert.PlanEntry) (string, error) {
	p.builder.buildInProgress.Set(1)
	defer p.builder.buildInProgress.Set(0)
	defer p.builder.seriesInMemory.Set(0)

	chunkClient, err := p.builder.createChunkClientForDay(p.dayStart)
	if err != nil {
		return "", errors.Wrap(err, "failed to create chunk client")
	}
	defer chunkClient.Stop()

	fetcher, err := newFetcher(p.userID, chunkClient, p.builder.fetchedChunks, p.builder.fetchedChunksSize)
	if err != nil {
		return "", errors.Wrap(err, "failed to create chunk fetcher")
	}

	tsdbBuilder, err := newTsdbBuilder(p.builder.cfg.OutputDirectory, p.dayStart, p.dayEnd, p.builder.cfg.TimestampTolerance, p.builder.cfg.SeriesBatchSize, p.log,
		p.builder.processedSeries, p.builder.writtenSamples, p.builder.seriesInMemory)
	if err != nil {
		return "", errors.Wrap(err, "failed to create TSDB builder")
	}

	g, gctx := errgroup.WithContext(ctx)
	for i := 0; i < p.builder.cfg.Concurrency; i++ {
		g.Go(func() error {
			return fetchAndBuild(gctx, fetcher, planEntryCh, tsdbBuilder, p.log, p.builder.chunksNotFound)
		})
	}

	if err := g.Wait(); err != nil {
		return "", errors.Wrap(err, "failed to build block")
	}

	// Finish block.
	ulid, err := tsdbBuilder.finishBlock("blocksconvert", map[string]string{
		cortex_tsdb.TenantIDExternalLabel: p.userID,
	})
	if err != nil {
		return "", errors.Wrap(err, "failed to finish block building")
	}

	blockDir := filepath.Join(p.builder.cfg.OutputDirectory, ulid.String())
	blockSize, err := getBlockSize(blockDir)
	if err != nil {
		return "", errors.Wrap(err, "block size")
	}

	level.Info(p.log).Log("msg", "successfully built block for a plan", "ulid", ulid.String(), "size", blockSize)
	p.builder.blocksSize.Add(float64(blockSize))

	if p.builder.cfg.UploadBlock {
		// No per-tenant config provider because the blocksconvert tool doesn't support it.
		userBucket := bucket.NewUserBucketClient(p.userID, p.builder.bucketClient, nil)

		err := uploadBlock(ctx, p.log, userBucket, blockDir)
		if err != nil {
			return "", errors.Wrap(err, "uploading block")
		}

		level.Info(p.log).Log("msg", "block uploaded", "ulid", ulid.String())

		if p.builder.cfg.DeleteLocalBlock {
			if err := os.RemoveAll(blockDir); err != nil {
				level.Warn(p.log).Log("msg", "failed to delete local block", "err", err)
			}
		}
	}

	// All OK
	return ulid.String(), nil
}

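// uploadBlock uploads the block directory to the tenant's bucket, retrying up
// to five times with a short backoff before giving up.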
func uploadBlock(ctx context.Context, planLog log.Logger, userBucket objstore.Bucket, blockDir string) error {
	boff := backoff.New(ctx, backoff.Config{
		MinBackoff: 1 * time.Second,
		MaxBackoff: 5 * time.Second,
		MaxRetries: 5,
	})

	for boff.Ongoing() {
		err := block.Upload(ctx, planLog, userBucket, blockDir, metadata.NoneFunc)
		if err == nil {
			return nil
		}

		level.Warn(planLog).Log("msg", "failed to upload block", "err", err)
		boff.Wait()
	}

	return boff.Err()
}

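// getBlockSize returns the total size in bytes of all files under dir,
// skipping the "series" subdirectory that holds temporary series files.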
func getBlockSize(dir string) (int64, error) {
	size := int64(0)

	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}

		if !info.IsDir() {
			size += info.Size()
		}

		// Ignore directory with temporary series files.
		if info.IsDir() && info.Name() == "series" {
			return filepath.SkipDir
		}

		return nil
	})
	return size, err
}

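// fetchAndBuild is the loop run by each concurrent series processor. It reads
// plan entries from input until the channel is closed or the context is
// cancelled, fetches the chunks for each series (retrying on transient
// errors), and appends the series to the TSDB builder.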
func fetchAndBuild(ctx context.Context, f *Fetcher, input chan blocksconvert.PlanEntry, tb *tsdbBuilder, log log.Logger, chunksNotFound prometheus.Counter) error {
	b := backoff.New(ctx, backoff.Config{
		MinBackoff: 1 * time.Second,
		MaxBackoff: 5 * time.Second,
		MaxRetries: 5,
	})

	for {
		select {
		case <-ctx.Done():
			return nil

		case e, ok := <-input:
			if !ok {
				// End of input.
				return nil
			}

			var m labels.Labels
			var cs []chunk.Chunk
			var err error

			// Rather than aborting the entire block build due to temporary errors ("connection reset by peer", "http2: client conn not usable"),
			// try to fetch chunks multiple times.
			for b.Reset(); b.Ongoing(); {
				m, cs, err = fetchAndBuildSingleSeries(ctx, f, e.Chunks)
				if err == nil {
					break
				}

				if b.Ongoing() {
					level.Warn(log).Log("msg", "failed to fetch chunks for series", "series", e.SeriesID, "err", err, "retries", b.NumRetries()+1)
					b.Wait()
				}
			}

			if err == nil {
				err = b.Err()
			}
			if err != nil {
				return errors.Wrapf(err, "failed to fetch chunks for series %s", e.SeriesID)
			}

			if len(e.Chunks) > len(cs) {
				chunksNotFound.Add(float64(len(e.Chunks) - len(cs)))
				level.Warn(log).Log("msg", "chunks for series not found", "seriesID", e.SeriesID, "expected", len(e.Chunks), "got", len(cs))
			}

			if len(cs) == 0 {
				continue
			}

			err = tb.buildSingleSeries(m, cs)
			if err != nil {
				return errors.Wrapf(err, "failed to build series %s", e.SeriesID)
			}
		}
	}
}

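// fetchAndBuildSingleSeries fetches the given chunks and returns the
// normalized series labels together with the chunks that were found. Missing
// chunks are not treated as an error; an error is returned only for storage
// failures, invalid labels, or chunks belonging to different series.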
func fetchAndBuildSingleSeries(ctx context.Context, fetcher *Fetcher, chunksIds []string) (labels.Labels, []chunk.Chunk, error) {
	cs, err := fetcher.fetchChunks(ctx, chunksIds)
	if err != nil && !errors.Is(err, chunk.ErrStorageObjectNotFound) {
		return nil, nil, errors.Wrap(err, "fetching chunks")
	}

	if len(cs) == 0 {
		return nil, nil, nil
	}

	m, err := normalizeLabels(cs[0].Metric)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "chunk has invalid metrics: %v", cs[0].Metric.String())
	}

	// Verify that all chunks belong to the same series.
	for _, c := range cs {
		nm, err := normalizeLabels(c.Metric)
		if err != nil {
			return nil, nil, errors.Wrapf(err, "chunk has invalid metrics: %v", c.Metric.String())
		}
		if !labels.Equal(m, nm) {
			return nil, nil, errors.Errorf("chunks for multiple metrics: %v, %v", m.String(), c.Metric.String())
		}
	}

	return m, cs, nil
}

// Labels are expected to be sorted, but there may be duplicate label names.
// This function verifies sortedness (sorting the labels if needed), and removes duplicate label names if they have the same value.
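// For example (illustrative values): {a="1", a="1", b="2"} normalizes to
// {a="1", b="2"}, while {a="1", a="2"} is returned together with
// errDuplicateLabelsDifferentValue.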
func normalizeLabels(lbls labels.Labels) (labels.Labels, error) {
	err := checkLabels(lbls)
	if err == errLabelsNotSorted {
		sort.Sort(lbls)
		err = checkLabels(lbls)
	}

	if err == errDuplicateLabelsSameValue {
		lbls = removeDuplicateLabels(lbls)
		err = checkLabels(lbls)
	}

	return lbls, err
}

var (
	errLabelsNotSorted               = errors.New("labels not sorted")
	errDuplicateLabelsSameValue      = errors.New("duplicate labels, same value")
	errDuplicateLabelsDifferentValue = errors.New("duplicate labels, different values")
)

// Returns one of errLabelsNotSorted, errDuplicateLabelsSameValue, errDuplicateLabelsDifferentValue,
// or nil, if labels are fine.
func checkLabels(lbls labels.Labels) error {
	prevName, prevValue := "", ""

	uniqueLabels := true
	for _, l := range lbls {
		switch {
		case l.Name < prevName:
			return errLabelsNotSorted
		case l.Name == prevName:
			if l.Value != prevValue {
				return errDuplicateLabelsDifferentValue
			}

			uniqueLabels = false
		}

		prevName = l.Name
		prevValue = l.Value
	}

	if !uniqueLabels {
		return errDuplicateLabelsSameValue
	}

	return nil
}

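// removeDuplicateLabels removes consecutive labels with the same name and
// value from the (sorted) label set, modifying the slice in place.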
func removeDuplicateLabels(lbls labels.Labels) labels.Labels {
	prevName, prevValue := "", ""

	for ix := 0; ix < len(lbls); {
		l := lbls[ix]
		if l.Name == prevName && l.Value == prevValue {
			lbls = append(lbls[:ix], lbls[ix+1:]...)
			continue
		}

		prevName = l.Name
		prevValue = l.Value
		ix++
	}

	return lbls
}

// Finds the storage configuration for the given day, and builds a chunk client for it.
func (b *Builder) createChunkClientForDay(dayStart time.Time) (chunk.Client, error) {
	for ix, s := range b.schemaConfig.Configs {
		if dayStart.Unix() < s.From.Unix() {
			continue
		}

		if ix+1 < len(b.schemaConfig.Configs) && dayStart.Unix() > b.schemaConfig.Configs[ix+1].From.Unix() {
			continue
		}

		objectStoreType := s.ObjectType
		if objectStoreType == "" {
			objectStoreType = s.IndexType
		}
		// No registerer, to avoid problems with registering the same metrics multiple times.
		chunks, err := storage.NewChunkClient(objectStoreType, b.storageConfig, b.schemaConfig, nil)
		if err != nil {
			return nil, errors.Wrap(err, "error creating object client")
		}
		return chunks, nil
	}

	return nil, errors.Errorf("no schema for day %v", dayStart.Format("2006-01-02"))
}