github.com/thanos-io/thanos@v0.32.5/pkg/replicate/replicator.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package replicate
     5  
     6  import (
     7  	"context"
     8  	"math/rand"
     9  	"time"
    10  
    11  	extflag "github.com/efficientgo/tools/extkingpin"
    12  	"github.com/go-kit/log"
    13  	"github.com/go-kit/log/level"
    14  	"github.com/oklog/run"
    15  	"github.com/oklog/ulid"
    16  	"github.com/opentracing/opentracing-go"
    17  	"github.com/pkg/errors"
    18  	amlabels "github.com/prometheus/alertmanager/pkg/labels"
    19  	"github.com/prometheus/client_golang/prometheus"
    20  	"github.com/prometheus/client_golang/prometheus/promauto"
    21  	"github.com/prometheus/common/model"
    22  	"github.com/prometheus/prometheus/model/labels"
    23  
    24  	"github.com/thanos-io/objstore"
    25  	"github.com/thanos-io/objstore/client"
    26  	objstoretracing "github.com/thanos-io/objstore/tracing/opentracing"
    27  
    28  	thanosblock "github.com/thanos-io/thanos/pkg/block"
    29  	"github.com/thanos-io/thanos/pkg/compact"
    30  	"github.com/thanos-io/thanos/pkg/component"
    31  	"github.com/thanos-io/thanos/pkg/extprom"
    32  	thanosmodel "github.com/thanos-io/thanos/pkg/model"
    33  	"github.com/thanos-io/thanos/pkg/prober"
    34  	"github.com/thanos-io/thanos/pkg/runutil"
    35  	"github.com/thanos-io/thanos/pkg/server/http"
    36  )
    37  
    38  const (
    39  	// Labels for metrics.
    40  	labelSuccess = "success"
    41  	labelError   = "error"
    42  )
    43  
    44  // ParseFlagMatchers parse flag into matchers.
    45  func ParseFlagMatchers(s string) ([]*labels.Matcher, error) {
    46  	amMatchers, err := amlabels.ParseMatchers(s)
    47  	if err != nil {
    48  		return nil, err
    49  	}
    50  	matchers := make([]*labels.Matcher, 0, len(amMatchers))
    51  	for _, a := range amMatchers {
    52  		if !model.LabelName.IsValid(model.LabelName(a.Name)) {
    53  			return nil, errors.Errorf("unsupported format for label %s", a.Name)
    54  		}
    55  		matchers = append(matchers, labels.MustNewMatcher(labels.MatchType(a.Type), a.Name, a.Value))
    56  	}
    57  
    58  	return matchers, nil
    59  }
    60  
    61  // RunReplicate replicate data based on config.
    62  func RunReplicate(
    63  	g *run.Group,
    64  	logger log.Logger,
    65  	reg *prometheus.Registry,
    66  	_ opentracing.Tracer,
    67  	httpBindAddr string,
    68  	httpTLSConfig string,
    69  	httpGracePeriod time.Duration,
    70  	labelSelector labels.Selector,
    71  	resolutions []compact.ResolutionLevel,
    72  	compactions []int,
    73  	fromObjStoreConfig *extflag.PathOrContent,
    74  	toObjStoreConfig *extflag.PathOrContent,
    75  	singleRun bool,
    76  	minTime, maxTime *thanosmodel.TimeOrDurationValue,
    77  	blockIDs []ulid.ULID,
    78  	ignoreMarkedForDeletion bool,
    79  ) error {
    80  	logger = log.With(logger, "component", "replicate")
    81  
    82  	level.Debug(logger).Log("msg", "setting up http listen-group")
    83  
    84  	httpProbe := prober.NewHTTP()
    85  	statusProber := prober.Combine(
    86  		httpProbe,
    87  		prober.NewInstrumentation(component.Replicate, logger, extprom.WrapRegistererWithPrefix("thanos_", reg)),
    88  	)
    89  
    90  	s := http.New(logger, reg, component.Replicate, httpProbe,
    91  		http.WithListen(httpBindAddr),
    92  		http.WithGracePeriod(httpGracePeriod),
    93  		http.WithTLSConfig(httpTLSConfig),
    94  	)
    95  
    96  	g.Add(func() error {
    97  		level.Info(logger).Log("msg", "Listening for http service", "address", httpBindAddr)
    98  
    99  		statusProber.Healthy()
   100  
   101  		return s.ListenAndServe()
   102  	}, func(err error) {
   103  		statusProber.NotReady(err)
   104  		defer statusProber.NotHealthy(err)
   105  
   106  		s.Shutdown(err)
   107  	})
   108  
   109  	fromConfContentYaml, err := fromObjStoreConfig.Content()
   110  	if err != nil {
   111  		return err
   112  	}
   113  
   114  	if len(fromConfContentYaml) == 0 {
   115  		return errors.New("No supported bucket was configured to replicate from")
   116  	}
   117  
   118  	bkt, err := client.NewBucket(logger, fromConfContentYaml, component.Replicate.String())
   119  	if err != nil {
   120  		return err
   121  	}
   122  	fromBkt := objstoretracing.WrapWithTraces(
   123  		objstore.WrapWithMetrics(
   124  			bkt,
   125  			prometheus.WrapRegistererWithPrefix("thanos_", prometheus.WrapRegistererWith(prometheus.Labels{"replicate": "from"}, reg)),
   126  			bkt.Name(),
   127  		),
   128  	)
   129  
   130  	toConfContentYaml, err := toObjStoreConfig.Content()
   131  	if err != nil {
   132  		return err
   133  	}
   134  
   135  	if len(toConfContentYaml) == 0 {
   136  		return errors.New("No supported bucket was configured to replicate to")
   137  	}
   138  
   139  	toBkt, err := client.NewBucket(logger, toConfContentYaml, component.Replicate.String())
   140  	if err != nil {
   141  		return err
   142  	}
   143  	toBkt = objstoretracing.WrapWithTraces(
   144  		objstore.WrapWithMetrics(
   145  			toBkt,
   146  			prometheus.WrapRegistererWithPrefix("thanos_", prometheus.WrapRegistererWith(prometheus.Labels{"replicate": "to"}, reg)),
   147  			toBkt.Name(),
   148  		),
   149  	)
   150  
   151  	replicationRunCounter := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   152  		Name: "thanos_replicate_replication_runs_total",
   153  		Help: "The number of replication runs split by success and error.",
   154  	}, []string{"result"})
   155  	replicationRunCounter.WithLabelValues(labelSuccess)
   156  	replicationRunCounter.WithLabelValues(labelError)
   157  
   158  	replicationRunDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
   159  		Name: "thanos_replicate_replication_run_duration_seconds",
   160  		Help: "The Duration of replication runs split by success and error.",
   161  	}, []string{"result"})
   162  	replicationRunDuration.WithLabelValues(labelSuccess)
   163  	replicationRunDuration.WithLabelValues(labelError)
   164  	fetcher, err := newMetaFetcher(logger, fromBkt, reg, *minTime, *maxTime, 32, ignoreMarkedForDeletion)
   165  	if err != nil {
   166  		return errors.Wrapf(err, "create meta fetcher with bucket %v", fromBkt)
   167  	}
   168  
   169  	blockFilter := NewBlockFilter(
   170  		logger,
   171  		labelSelector,
   172  		resolutions,
   173  		compactions,
   174  		blockIDs,
   175  	).Filter
   176  	metrics := newReplicationMetrics(reg)
   177  	ctx, cancel := context.WithCancel(context.Background())
   178  
   179  	replicateFn := func() error {
   180  		timestamp := time.Now()
   181  		entropy := ulid.Monotonic(rand.New(rand.NewSource(timestamp.UnixNano())), 0)
   182  
   183  		runID, err := ulid.New(ulid.Timestamp(timestamp), entropy)
   184  		if err != nil {
   185  			return errors.Wrap(err, "generate replication run-id")
   186  		}
   187  
   188  		logger := log.With(logger, "replication-run-id", runID.String())
   189  		level.Info(logger).Log("msg", "running replication attempt")
   190  
   191  		if err := newReplicationScheme(logger, metrics, blockFilter, fetcher, fromBkt, toBkt, reg).execute(ctx); err != nil {
   192  			return errors.Wrap(err, "replication execute")
   193  		}
   194  
   195  		return nil
   196  	}
   197  
   198  	g.Add(func() error {
   199  		defer runutil.CloseWithLogOnErr(logger, fromBkt, "from bucket client")
   200  		defer runutil.CloseWithLogOnErr(logger, toBkt, "to bucket client")
   201  
   202  		statusProber.Ready()
   203  		if singleRun || len(blockIDs) > 0 {
   204  			return replicateFn()
   205  		}
   206  
   207  		return runutil.Repeat(time.Minute, ctx.Done(), func() error {
   208  			start := time.Now()
   209  			if err := replicateFn(); err != nil {
   210  				level.Error(logger).Log("msg", "running replication failed", "err", err)
   211  				replicationRunCounter.WithLabelValues(labelError).Inc()
   212  				replicationRunDuration.WithLabelValues(labelError).Observe(time.Since(start).Seconds())
   213  
   214  				// No matter the error we want to repeat indefinitely.
   215  				return nil
   216  			}
   217  			replicationRunCounter.WithLabelValues(labelSuccess).Inc()
   218  			replicationRunDuration.WithLabelValues(labelSuccess).Observe(time.Since(start).Seconds())
   219  			level.Info(logger).Log("msg", "ran replication successfully")
   220  
   221  			return nil
   222  		})
   223  	}, func(error) {
   224  		cancel()
   225  	})
   226  
   227  	level.Info(logger).Log("msg", "starting replication")
   228  
   229  	return nil
   230  }
   231  
   232  func newMetaFetcher(
   233  	logger log.Logger,
   234  	fromBkt objstore.InstrumentedBucket,
   235  	reg prometheus.Registerer,
   236  	minTime,
   237  	maxTime thanosmodel.TimeOrDurationValue,
   238  	concurrency int,
   239  	ignoreMarkedForDeletion bool,
   240  ) (*thanosblock.MetaFetcher, error) {
   241  	filters := []thanosblock.MetadataFilter{
   242  		thanosblock.NewTimePartitionMetaFilter(minTime, maxTime),
   243  	}
   244  	if ignoreMarkedForDeletion {
   245  		filters = append(filters, thanosblock.NewIgnoreDeletionMarkFilter(logger, fromBkt, 0, concurrency))
   246  	}
   247  	return thanosblock.NewMetaFetcher(
   248  		logger,
   249  		concurrency,
   250  		fromBkt,
   251  		"",
   252  		reg,
   253  		filters,
   254  	)
   255  }