github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/usagestats/reporter.go (about)

     1  package usagestats
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"flag"
     8  	"io"
     9  	"math"
    10  	"time"
    11  
    12  	"github.com/go-kit/log"
    13  	"github.com/go-kit/log/level"
    14  	"github.com/google/uuid"
    15  	"github.com/grafana/dskit/backoff"
    16  	"github.com/grafana/dskit/kv"
    17  	"github.com/grafana/dskit/multierror"
    18  	"github.com/grafana/dskit/services"
    19  	"github.com/prometheus/client_golang/prometheus"
    20  
    21  	"github.com/grafana/loki/pkg/storage/chunk/client"
    22  	"github.com/grafana/loki/pkg/util/build"
    23  )
    24  
const (
	// ClusterSeedFileName is the object-store key under which the cluster
	// seed is persisted (see writeSeedFile / readSeedFile).
	ClusterSeedFileName = "loki_cluster_seed.json"
	// attemptNumber is how many times we will try to read a corrupted
	// cluster seed before deleting it (see fetchSeed).
	attemptNumber = 4
	// seedKey is the key for the cluster seed to use with the kv store.
	seedKey = "usagestats_token"
)
    33  
var (
	// reportCheckInterval is how often the running loop wakes up to decide
	// whether a report is due.
	reportCheckInterval = time.Minute
	// reportInterval is the target period between two usage reports.
	reportInterval = 4 * time.Hour

	// stabilityCheckInterval is the pause between successive kv reads in
	// ensureStableKey.
	stabilityCheckInterval = 5 * time.Second
	// stabilityMinimunRequired is the number of consecutive identical reads
	// (beyond the first observation) required before the seed is considered
	// stable. NOTE(review): "Minimun" is a typo for "Minimum"; kept as-is
	// because the identifier is referenced elsewhere in this file.
	stabilityMinimunRequired = 6
)
    41  
// Config configures anonymous usage-stats reporting.
type Config struct {
	// Enabled toggles anonymous usage reporting.
	Enabled bool `yaml:"reporting_enabled"`
	// Leader is not yaml-configurable ("-"); when true this instance runs
	// the leader path (initLeader) that creates and persists the cluster seed.
	Leader bool `yaml:"-"`
}
    46  
    47  // RegisterFlags adds the flags required to config this to the given FlagSet
    48  func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
    49  	f.BoolVar(&cfg.Enabled, "reporting.enabled", true, "Enable anonymous usage reporting.")
    50  }
    51  
// Reporter anonymously reports usage statistics for the cluster.
// It runs as a dskit service via the embedded services.Service
// (see NewReporter / running).
type Reporter struct {
	logger       log.Logger
	objectClient client.ObjectClient
	reg          prometheus.Registerer

	services.Service

	conf       Config
	kvConfig   kv.Config
	cluster    *ClusterSeed // resolved cluster seed; nil until init completes
	lastReport time.Time    // when the last usage report was successfully sent
}
    64  
    65  func NewReporter(config Config, kvConfig kv.Config, objectClient client.ObjectClient, logger log.Logger, reg prometheus.Registerer) (*Reporter, error) {
    66  	if !config.Enabled {
    67  		return nil, nil
    68  	}
    69  	r := &Reporter{
    70  		logger:       logger,
    71  		objectClient: objectClient,
    72  		conf:         config,
    73  		kvConfig:     kvConfig,
    74  		reg:          reg,
    75  	}
    76  	r.Service = services.NewBasicService(nil, r.running, nil)
    77  	return r, nil
    78  }
    79  
    80  func (rep *Reporter) initLeader(ctx context.Context) *ClusterSeed {
    81  	kvClient, err := kv.NewClient(rep.kvConfig, JSONCodec, nil, rep.logger)
    82  	if err != nil {
    83  		level.Info(rep.logger).Log("msg", "failed to create kv client", "err", err)
    84  		return nil
    85  	}
    86  	// Try to become leader via the kv client
    87  	backoff := backoff.New(ctx, backoff.Config{
    88  		MinBackoff: time.Second,
    89  		MaxBackoff: time.Minute,
    90  		MaxRetries: 0,
    91  	})
    92  	for backoff.Ongoing() {
    93  		// create a new cluster seed
    94  		seed := ClusterSeed{
    95  			UID:               uuid.NewString(),
    96  			PrometheusVersion: build.GetVersion(),
    97  			CreatedAt:         time.Now(),
    98  		}
    99  		if err := kvClient.CAS(ctx, seedKey, func(in interface{}) (out interface{}, retry bool, err error) {
   100  			// The key is already set, so we don't need to do anything
   101  			if in != nil {
   102  				if kvSeed, ok := in.(*ClusterSeed); ok && kvSeed != nil && kvSeed.UID != seed.UID {
   103  					seed = *kvSeed
   104  					return nil, false, nil
   105  				}
   106  			}
   107  			return &seed, true, nil
   108  		}); err != nil {
   109  			level.Info(rep.logger).Log("msg", "failed to CAS cluster seed key", "err", err)
   110  			continue
   111  		}
   112  		// ensure stability of the cluster seed
   113  		stableSeed := ensureStableKey(ctx, kvClient, rep.logger)
   114  		seed = *stableSeed
   115  		// Fetch the remote cluster seed.
   116  		remoteSeed, err := rep.fetchSeed(ctx,
   117  			func(err error) bool {
   118  				// we only want to retry if the error is not an object not found error
   119  				return !rep.objectClient.IsObjectNotFoundErr(err)
   120  			})
   121  		if err != nil {
   122  			if rep.objectClient.IsObjectNotFoundErr(err) {
   123  				// we are the leader and we need to save the file.
   124  				if err := rep.writeSeedFile(ctx, seed); err != nil {
   125  					level.Info(rep.logger).Log("msg", "failed to CAS cluster seed key", "err", err)
   126  					backoff.Wait()
   127  					continue
   128  				}
   129  				return &seed
   130  			}
   131  			backoff.Wait()
   132  			continue
   133  		}
   134  		return remoteSeed
   135  	}
   136  	return nil
   137  }
   138  
// ensureStableKey ensures that the cluster seed is stable for at least 30seconds.
// This is required when using gossiping kv client like memberlist which will never have the same seed
// but will converge eventually.
// NOTE(review): with stabilityCheckInterval=5s and stabilityMinimunRequired=6
// this actually requires roughly 35-40s of identical reads (8 polls), not
// 30s — confirm whether the comment or the constants should change.
// NOTE(review): the loop never checks ctx.Done(); if ctx is cancelled and Get
// keeps failing this blocks forever. It cannot be changed in isolation:
// initLeader dereferences the returned pointer unconditionally.
func ensureStableKey(ctx context.Context, kvClient kv.Client, logger log.Logger) *ClusterSeed {
	var (
		previous    *ClusterSeed // last seed observed
		stableCount int          // consecutive polls returning the same UID
	)
	for {
		// Poll the kv store at a fixed cadence.
		time.Sleep(stabilityCheckInterval)
		value, err := kvClient.Get(ctx, seedKey)
		if err != nil {
			level.Debug(logger).Log("msg", "failed to get cluster seed key for stability check", "err", err)
			continue
		}
		if seed, ok := value.(*ClusterSeed); ok && seed != nil {
			if previous == nil {
				// First observation: remember it and keep polling.
				previous = seed
				continue
			}
			if previous.UID != seed.UID {
				// Seed changed: gossip has not converged yet, restart the count.
				previous = seed
				stableCount = 0
				continue
			}
			stableCount++
			if stableCount > stabilityMinimunRequired {
				return seed
			}
		}
	}
}
   171  
   172  func (rep *Reporter) init(ctx context.Context) {
   173  	if rep.conf.Leader {
   174  		rep.cluster = rep.initLeader(ctx)
   175  		return
   176  	}
   177  	// follower only wait for the cluster seed to be set.
   178  	// it will try forever to fetch the cluster seed.
   179  	seed, _ := rep.fetchSeed(ctx, nil)
   180  	rep.cluster = seed
   181  }
   182  
   183  // fetchSeed fetches the cluster seed from the object store and try until it succeeds.
   184  // continueFn allow you to decide if we should continue retrying. Nil means always retry
   185  func (rep *Reporter) fetchSeed(ctx context.Context, continueFn func(err error) bool) (*ClusterSeed, error) {
   186  	var (
   187  		backoff = backoff.New(ctx, backoff.Config{
   188  			MinBackoff: time.Second,
   189  			MaxBackoff: time.Minute,
   190  			MaxRetries: 0,
   191  		})
   192  		readingErr = 0
   193  	)
   194  	for backoff.Ongoing() {
   195  		seed, err := rep.readSeedFile(ctx)
   196  		if err != nil {
   197  			if !rep.objectClient.IsObjectNotFoundErr(err) {
   198  				readingErr++
   199  			}
   200  			level.Debug(rep.logger).Log("msg", "failed to read cluster seed file", "err", err)
   201  			if readingErr > attemptNumber {
   202  				if err := rep.objectClient.DeleteObject(ctx, ClusterSeedFileName); err != nil {
   203  					level.Error(rep.logger).Log("msg", "failed to delete corrupted cluster seed file, deleting it", "err", err)
   204  				}
   205  				readingErr = 0
   206  			}
   207  			if continueFn == nil || continueFn(err) {
   208  				backoff.Wait()
   209  				continue
   210  			}
   211  			return nil, err
   212  		}
   213  		return seed, nil
   214  	}
   215  	return nil, backoff.Err()
   216  }
   217  
   218  // readSeedFile reads the cluster seed file from the object store.
   219  func (rep *Reporter) readSeedFile(ctx context.Context) (*ClusterSeed, error) {
   220  	reader, _, err := rep.objectClient.GetObject(ctx, ClusterSeedFileName)
   221  	if err != nil {
   222  		return nil, err
   223  	}
   224  	if err != nil {
   225  		return nil, err
   226  	}
   227  	defer func() {
   228  		if err := reader.Close(); err != nil {
   229  			level.Error(rep.logger).Log("msg", "failed to close reader", "err", err)
   230  		}
   231  	}()
   232  	data, err := io.ReadAll(reader)
   233  	if err != nil {
   234  		return nil, err
   235  	}
   236  	seed, err := JSONCodec.Decode(data)
   237  	if err != nil {
   238  		return nil, err
   239  	}
   240  	return seed.(*ClusterSeed), nil
   241  }
   242  
   243  // writeSeedFile writes the cluster seed to the object store.
   244  func (rep *Reporter) writeSeedFile(ctx context.Context, seed ClusterSeed) error {
   245  	data, err := JSONCodec.Encode(seed)
   246  	if err != nil {
   247  		return err
   248  	}
   249  	return rep.objectClient.PutObject(ctx, ClusterSeedFileName, bytes.NewReader(data))
   250  }
   251  
// running inits the reporter seed and start sending report for every interval
// until ctx is cancelled. It is installed as the dskit BasicService running
// function in NewReporter.
func (rep *Reporter) running(ctx context.Context) error {
	rep.init(ctx)

	if rep.cluster == nil {
		// No seed could be resolved (e.g. kv client creation failed); stay
		// idle until shutdown rather than reporting without a cluster identity.
		<-ctx.Done()
		// Clean cancellation is a normal stop, not an error.
		if err := ctx.Err(); !errors.Is(err, context.Canceled) {
			return err
		}
		return nil
	}
	// check every minute if we should report.
	ticker := time.NewTicker(reportCheckInterval)
	defer ticker.Stop()

	// find  when to send the next report.
	next := nextReport(reportInterval, rep.cluster.CreatedAt, time.Now())
	if rep.lastReport.IsZero() {
		// if we never reported assumed it was the last interval.
		rep.lastReport = next.Add(-reportInterval)
	}
	for {
		select {
		case <-ticker.C:
			now := time.Now()
			// Skip this tick unless a full reportInterval has elapsed since
			// the last successful report.
			// NOTE(review): next.Equal(now) is almost never true with a
			// minute-granularity ticker; the now.Sub(lastReport) condition is
			// what actually triggers reports — confirm intent.
			if !next.Equal(now) && now.Sub(rep.lastReport) < reportInterval {
				continue
			}
			level.Debug(rep.logger).Log("msg", "reporting cluster stats", "date", time.Now())
			if err := rep.reportUsage(ctx, next); err != nil {
				// Best-effort: log and retry on a later tick without
				// advancing lastReport.
				level.Info(rep.logger).Log("msg", "failed to report usage", "err", err)
				continue
			}
			rep.lastReport = next
			next = next.Add(reportInterval)
		case <-ctx.Done():
			// Clean cancellation is a normal stop, not an error.
			if err := ctx.Err(); !errors.Is(err, context.Canceled) {
				return err
			}
			return nil
		}
	}
}
   295  
   296  // reportUsage reports the usage to grafana.com.
   297  func (rep *Reporter) reportUsage(ctx context.Context, interval time.Time) error {
   298  	backoff := backoff.New(ctx, backoff.Config{
   299  		MinBackoff: time.Second,
   300  		MaxBackoff: 30 * time.Second,
   301  		MaxRetries: 5,
   302  	})
   303  	var errs multierror.MultiError
   304  	for backoff.Ongoing() {
   305  		if err := sendReport(ctx, rep.cluster, interval); err != nil {
   306  			level.Info(rep.logger).Log("msg", "failed to send usage report", "retries", backoff.NumRetries(), "err", err)
   307  			errs.Add(err)
   308  			backoff.Wait()
   309  			continue
   310  		}
   311  		level.Debug(rep.logger).Log("msg", "usage report sent with success")
   312  		return nil
   313  	}
   314  	return errs.Err()
   315  }
   316  
   317  // nextReport compute the next report time based on the interval.
   318  // The interval is based off the creation of the cluster seed to avoid all cluster reporting at the same time.
   319  func nextReport(interval time.Duration, createdAt, now time.Time) time.Time {
   320  	// createdAt * (x * interval ) >= now
   321  	return createdAt.Add(time.Duration(math.Ceil(float64(now.Sub(createdAt))/float64(interval))) * interval)
   322  }