github.com/grafana/pyroscope@v1.18.0/pkg/usagestats/reporter.go (about)

     1  package usagestats
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"flag"
     8  	"io"
     9  	"math"
    10  	"time"
    11  
    12  	"github.com/go-kit/log"
    13  	"github.com/go-kit/log/level"
    14  	"github.com/google/uuid"
    15  	"github.com/grafana/dskit/backoff"
    16  	"github.com/grafana/dskit/kv"
    17  	"github.com/grafana/dskit/multierror"
    18  	"github.com/grafana/dskit/services"
    19  	"github.com/prometheus/client_golang/prometheus"
    20  	"github.com/thanos-io/objstore"
    21  
    22  	"github.com/grafana/pyroscope/pkg/util/build"
    23  )
    24  
const (
	// ClusterSeedFileName is the name of the cluster seed file stored in the
	// object store; it is shared by every member of the cluster.
	ClusterSeedFileName = "pyroscope_cluster_seed.json"
	// attemptNumber is how many times we will try to read a corrupted cluster seed before deleting it.
	attemptNumber = 4
	// seedKey is the key for the cluster seed to use with the kv store.
	seedKey = "usagestats_token"
)
    33  
var (
	// reportCheckInterval is how often the running loop wakes up to check
	// whether a report is due.
	reportCheckInterval = time.Minute
	// reportInterval is the period between two usage reports.
	reportInterval      = 4 * time.Hour

	// stabilityCheckInterval is the pause between kv-store reads when waiting
	// for the cluster seed to converge (see ensureStableKey).
	stabilityCheckInterval   = 5 * time.Second
	// stabilityMinimunRequired is the number of consecutive identical reads
	// required before the seed is considered stable.
	// NOTE(review): the name misspells "Minimum"; kept as-is because it is
	// referenced elsewhere in this package.
	stabilityMinimunRequired = 6
)
    41  
// Config configures anonymous usage-statistics reporting.
type Config struct {
	// Enabled turns usage reporting on or off (yaml: reporting_enabled).
	Enabled bool `yaml:"reporting_enabled"`
	// Leader marks this instance as responsible for creating and publishing
	// the cluster seed. It is set programmatically, never from yaml.
	Leader  bool `yaml:"-"`
}
    46  
// RegisterFlags adds the flags required to config this to the given FlagSet.
// Only Enabled is exposed as a flag; Leader is set programmatically.
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	f.BoolVar(&cfg.Enabled, "usage-stats.enabled", true, "Enable anonymous usage statistics collection. For more details about usage statistics, refer to https://grafana.com/docs/pyroscope/latest/configure-server/anonymous-usage-statistics-reporting/")
}
    51  
// Reporter periodically sends anonymous usage statistics for the cluster.
// It is a dskit service; the embedded Service drives the running loop.
type Reporter struct {
	logger log.Logger
	bucket objstore.Bucket // object store holding the shared cluster seed file
	reg    prometheus.Registerer

	services.Service

	conf       Config
	kvConfig   kv.Config   // kv store used by the leader to coordinate the seed
	cluster    ClusterSeed // established by init(); an empty UID means init failed
	lastReport time.Time   // time of the last successful (or assumed) report
}
    64  
    65  func NewReporter(config Config, kvConfig kv.Config, objectClient objstore.Bucket, logger log.Logger, reg prometheus.Registerer) (*Reporter, error) {
    66  	if !config.Enabled {
    67  		return nil, nil
    68  	}
    69  	r := &Reporter{
    70  		logger:   logger,
    71  		bucket:   objectClient,
    72  		conf:     config,
    73  		kvConfig: kvConfig,
    74  		reg:      reg,
    75  	}
    76  	r.Service = services.NewBasicService(nil, r.running, nil)
    77  	return r, nil
    78  }
    79  
    80  func (rep *Reporter) initLeader(ctx context.Context) ClusterSeed {
    81  	kvClient, err := kv.NewClient(rep.kvConfig, JSONCodec, nil, rep.logger)
    82  	if err != nil {
    83  		level.Info(rep.logger).Log("msg", "failed to create kv client", "err", err)
    84  		return ClusterSeed{}
    85  	}
    86  	// Try to become leader via the kv client
    87  	backoff := backoff.New(ctx, backoff.Config{
    88  		MinBackoff: time.Second,
    89  		MaxBackoff: time.Minute,
    90  		MaxRetries: 0,
    91  	})
    92  	for backoff.Ongoing() {
    93  		// create a new cluster seed
    94  		seed := ClusterSeed{
    95  			UID:               uuid.NewString(),
    96  			PrometheusVersion: build.GetVersion(),
    97  			CreatedAt:         time.Now(),
    98  		}
    99  		if err := kvClient.CAS(ctx, seedKey, func(in interface{}) (out interface{}, retry bool, err error) {
   100  			// The key is already set, so we don't need to do anything
   101  			if in != nil {
   102  				if kvSeed, ok := in.(*ClusterSeed); ok && kvSeed != nil && kvSeed.UID != seed.UID {
   103  					seed = *kvSeed
   104  					return nil, false, nil
   105  				}
   106  			}
   107  
   108  			return seed.Clone(), true, nil
   109  		}); err != nil {
   110  			level.Info(rep.logger).Log("msg", "failed to CAS cluster seed key", "err", err)
   111  			continue
   112  		}
   113  		// ensure stability of the cluster seed
   114  		stableSeed := ensureStableKey(ctx, kvClient, rep.logger)
   115  		seed = *stableSeed
   116  		// Fetch the remote cluster seed.
   117  		remoteSeed, err := rep.fetchSeed(ctx,
   118  			func(err error) bool {
   119  				// we only want to retry if the error is not an object not found error
   120  				return !rep.bucket.IsObjNotFoundErr(err)
   121  			})
   122  		if err != nil {
   123  			if rep.bucket.IsObjNotFoundErr(err) {
   124  				// we are the leader and we need to save the file.
   125  				if err := rep.writeSeedFile(ctx, seed); err != nil {
   126  					level.Info(rep.logger).Log("msg", "failed to CAS cluster seed key", "err", err)
   127  					backoff.Wait()
   128  					continue
   129  				}
   130  				return seed
   131  			}
   132  			backoff.Wait()
   133  			continue
   134  		}
   135  		return remoteSeed
   136  	}
   137  	return ClusterSeed{}
   138  }
   139  
// ensureStableKey waits until the cluster seed reported by the kv store is
// stable: it must be observed unchanged (same UID) for more than
// stabilityMinimunRequired consecutive reads taken stabilityCheckInterval
// apart (roughly 30s or more in total).
// This is required when using a gossiping kv client like memberlist which
// may report different seeds on different members until it converges.
// NOTE(review): this loop does not observe ctx cancellation — if ctx is done,
// Get will keep failing and the loop spins forever; confirm callers always
// run it under a live context.
func ensureStableKey(ctx context.Context, kvClient kv.Client, logger log.Logger) *ClusterSeed {
	var (
		previous    *ClusterSeed // last seed observed; nil until the first successful read
		stableCount int          // consecutive reads that matched previous
	)
	for {
		time.Sleep(stabilityCheckInterval)
		value, err := kvClient.Get(ctx, seedKey)
		if err != nil {
			level.Debug(logger).Log("msg", "failed to get cluster seed key for stability check", "err", err)
			continue
		}
		if seed, ok := value.(*ClusterSeed); ok && seed != nil {
			if previous == nil {
				// First observation: record it and start counting.
				previous = seed
				continue
			}
			if previous.UID != seed.UID {
				// The seed changed: restart the stability count.
				previous = seed
				stableCount = 0
				continue
			}
			stableCount++
			if stableCount > stabilityMinimunRequired {
				return seed
			}
		}
	}
}
   172  
   173  func (rep *Reporter) init(ctx context.Context) {
   174  	if rep.conf.Leader {
   175  		rep.cluster = rep.initLeader(ctx)
   176  		return
   177  	}
   178  	// follower only wait for the cluster seed to be set.
   179  	// it will try forever to fetch the cluster seed.
   180  	seed, _ := rep.fetchSeed(ctx, nil)
   181  	rep.cluster = seed
   182  }
   183  
   184  // fetchSeed fetches the cluster seed from the object store and try until it succeeds.
   185  // continueFn allow you to decide if we should continue retrying. Nil means always retry
   186  func (rep *Reporter) fetchSeed(ctx context.Context, continueFn func(err error) bool) (ClusterSeed, error) {
   187  	var (
   188  		backoff = backoff.New(ctx, backoff.Config{
   189  			MinBackoff: time.Second,
   190  			MaxBackoff: time.Minute,
   191  			MaxRetries: 0,
   192  		})
   193  		readingErr = 0
   194  	)
   195  	for backoff.Ongoing() {
   196  		seed, err := rep.readSeedFile(ctx)
   197  		if err != nil {
   198  			if !rep.bucket.IsObjNotFoundErr(err) {
   199  				readingErr++
   200  			}
   201  			level.Debug(rep.logger).Log("msg", "failed to read cluster seed file", "err", err)
   202  			if readingErr > attemptNumber {
   203  				if err := rep.bucket.Delete(ctx, ClusterSeedFileName); err != nil {
   204  					level.Error(rep.logger).Log("msg", "failed to delete corrupted cluster seed file, deleting it", "err", err)
   205  				}
   206  				readingErr = 0
   207  			}
   208  			if continueFn == nil || continueFn(err) {
   209  				backoff.Wait()
   210  				continue
   211  			}
   212  			return ClusterSeed{}, err
   213  		}
   214  		return seed, nil
   215  	}
   216  	return ClusterSeed{}, backoff.Err()
   217  }
   218  
   219  // readSeedFile reads the cluster seed file from the object store.
   220  func (rep *Reporter) readSeedFile(ctx context.Context) (ClusterSeed, error) {
   221  	reader, err := rep.bucket.Get(ctx, ClusterSeedFileName)
   222  	if err != nil {
   223  		return ClusterSeed{}, err
   224  	}
   225  	defer func() {
   226  		if err := reader.Close(); err != nil {
   227  			level.Error(rep.logger).Log("msg", "failed to close reader", "err", err)
   228  		}
   229  	}()
   230  	data, err := io.ReadAll(reader)
   231  	if err != nil {
   232  		return ClusterSeed{}, err
   233  	}
   234  	seed, err := JSONCodec.Decode(data)
   235  	if err != nil {
   236  		return ClusterSeed{}, err
   237  	}
   238  	return *(seed.(*ClusterSeed)), nil
   239  }
   240  
   241  // writeSeedFile writes the cluster seed to the object store.
   242  func (rep *Reporter) writeSeedFile(ctx context.Context, seed ClusterSeed) error {
   243  	data, err := JSONCodec.Encode(seed)
   244  	if err != nil {
   245  		return err
   246  	}
   247  	return rep.bucket.Upload(ctx, ClusterSeedFileName, bytes.NewReader(data))
   248  }
   249  
// running inits the reporter seed and starts sending a report every interval.
// It is the BasicService run function: it returns nil on context cancellation
// and the context error on any other context failure.
func (rep *Reporter) running(ctx context.Context) error {
	rep.init(ctx)

	if rep.cluster.UID == "" {
		// init failed to establish a seed: stay idle until shutdown rather
		// than reporting with an empty cluster identity.
		<-ctx.Done()
		if err := ctx.Err(); !errors.Is(err, context.Canceled) {
			return err
		}
		return nil
	}
	// check every minute if we should report.
	ticker := time.NewTicker(reportCheckInterval)
	defer ticker.Stop()

	// find when to send the next report; the schedule is anchored on the
	// seed creation time so clusters report at different times.
	next := nextReport(reportInterval, rep.cluster.CreatedAt, time.Now())
	if rep.lastReport.IsZero() {
		// if we never reported assumed it was the last interval.
		rep.lastReport = next.Add(-reportInterval)
	}
	for {
		select {
		case <-ticker.C:
			now := time.Now()
			// Skip this tick unless the report is due (next reached or a
			// full interval elapsed since the last report).
			if !next.Equal(now) && now.Sub(rep.lastReport) < reportInterval {
				continue
			}
			level.Debug(rep.logger).Log("msg", "reporting cluster stats", "date", time.Now())
			if err := rep.reportUsage(ctx, next); err != nil {
				// Keep the schedule; the next tick will retry the same slot.
				level.Info(rep.logger).Log("msg", "failed to report usage", "err", err)
				continue
			}
			rep.lastReport = next
			next = next.Add(reportInterval)
		case <-ctx.Done():
			if err := ctx.Err(); !errors.Is(err, context.Canceled) {
				return err
			}
			return nil
		}
	}
}
   293  
   294  // reportUsage reports the usage to grafana.com.
   295  func (rep *Reporter) reportUsage(ctx context.Context, interval time.Time) error {
   296  	backoff := backoff.New(ctx, backoff.Config{
   297  		MinBackoff: time.Second,
   298  		MaxBackoff: 30 * time.Second,
   299  		MaxRetries: 5,
   300  	})
   301  	var errs multierror.MultiError
   302  	for backoff.Ongoing() {
   303  		if err := sendReport(ctx, rep.cluster, interval); err != nil {
   304  			level.Info(rep.logger).Log("msg", "failed to send usage report", "retries", backoff.NumRetries(), "err", err)
   305  			errs.Add(err)
   306  			backoff.Wait()
   307  			continue
   308  		}
   309  		level.Debug(rep.logger).Log("msg", "usage report sent with success")
   310  		return nil
   311  	}
   312  	return errs.Err()
   313  }
   314  
   315  // nextReport compute the next report time based on the interval.
   316  // The interval is based off the creation of the cluster seed to avoid all cluster reporting at the same time.
   317  func nextReport(interval time.Duration, createdAt, now time.Time) time.Time {
   318  	// createdAt * (x * interval ) >= now
   319  	return createdAt.Add(time.Duration(math.Ceil(float64(now.Sub(createdAt))/float64(interval))) * interval)
   320  }