github.com/grafana/pyroscope@v1.18.0/pkg/metastore/index/dlq/recovery.go (about)

     1  package dlq
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"flag"
     7  	"io"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/go-kit/log"
    12  	"github.com/go-kit/log/level"
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"github.com/thanos-io/objstore"
    15  	"google.golang.org/grpc/codes"
    16  	"google.golang.org/grpc/status"
    17  
    18  	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
    19  	"github.com/grafana/pyroscope/pkg/block"
    20  	"github.com/grafana/pyroscope/pkg/metastore/raftnode"
    21  )
    22  
// Config holds the settings for DLQ (Dead Letter Queue) recovery.
type Config struct {
	// CheckInterval is how often the DLQ is scanned for block metadata
	// to recover. A value of 0 disables recovery (Start becomes a no-op).
	CheckInterval time.Duration `yaml:"dlq_recovery_check_interval"`
}
    26  
    27  func (c *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
    28  	f.DurationVar(&c.CheckInterval, prefix+"dlq-recovery-check-interval", 15*time.Second, "Dead Letter Queue check interval. 0 to disable.")
    29  }
    30  
// Metastore is the narrow view of the metastore that Recovery depends on:
// the single call used to re-submit block metadata read from the DLQ.
type Metastore interface {
	AddRecoveredBlock(context.Context, *metastorev1.AddBlockRequest) (*metastorev1.AddBlockResponse, error)
}
    34  
// Recovery periodically scans the Dead Letter Queue prefix of the bucket
// and re-submits any block metadata found there to the metastore.
type Recovery struct {
	config    Config
	logger    log.Logger
	metastore Metastore
	bucket    objstore.Bucket
	metrics   *metrics

	// started, cancel, and the loop goroutine are guarded by m;
	// Start/Stop are safe to call concurrently and are idempotent.
	started bool
	cancel  func()
	m       sync.Mutex
}
    46  
    47  func NewRecovery(logger log.Logger, config Config, metastore Metastore, bucket objstore.Bucket, reg prometheus.Registerer) *Recovery {
    48  	return &Recovery{
    49  		config:    config,
    50  		logger:    logger,
    51  		metastore: metastore,
    52  		bucket:    bucket,
    53  		metrics:   newMetrics(reg),
    54  	}
    55  }
    56  
    57  func (r *Recovery) Start() {
    58  	if r.config.CheckInterval == 0 {
    59  		return
    60  	}
    61  	r.m.Lock()
    62  	defer r.m.Unlock()
    63  	if r.started {
    64  		r.logger.Log("msg", "recovery already started")
    65  		return
    66  	}
    67  	ctx, cancel := context.WithCancel(context.Background())
    68  	r.cancel = cancel
    69  	r.started = true
    70  	go r.recoverLoop(ctx)
    71  	r.logger.Log("msg", "recovery started")
    72  }
    73  
    74  func (r *Recovery) Stop() {
    75  	if r.config.CheckInterval == 0 {
    76  		return
    77  	}
    78  	r.m.Lock()
    79  	defer r.m.Unlock()
    80  	if !r.started {
    81  		r.logger.Log("msg", "recovery already stopped")
    82  		return
    83  	}
    84  	if r.cancel != nil {
    85  		r.cancel()
    86  	}
    87  	r.started = false
    88  	r.logger.Log("msg", "recovery stopped")
    89  }
    90  
    91  func (r *Recovery) recoverLoop(ctx context.Context) {
    92  	ticker := time.NewTicker(r.config.CheckInterval)
    93  	defer ticker.Stop()
    94  	for {
    95  		select {
    96  		case <-ctx.Done():
    97  			return
    98  		case <-ticker.C:
    99  			r.recoverTick(ctx)
   100  		}
   101  	}
   102  }
   103  
   104  func (r *Recovery) recoverTick(ctx context.Context) {
   105  	err := r.bucket.Iter(ctx, block.DirNameDLQ, func(path string) error {
   106  		return r.recover(ctx, path)
   107  	}, objstore.WithRecursiveIter())
   108  	if err != nil {
   109  		level.Error(r.logger).Log("msg", "failed to recover block metadata", "err", err)
   110  	}
   111  }
   112  
// recover attempts to recover a single DLQ object: it reads the object at
// path, unmarshals it as block metadata, and re-submits it to the metastore.
//
// The named return value err drives the deferred cleanup below: a nil return
// means the object is handled (successfully recovered OR deliberately
// skipped as unrecoverable) and must be deleted from the bucket; a non-nil
// return leaves the object in place to be retried on a later tick.
func (r *Recovery) recover(ctx context.Context, path string) (err error) {
	defer func() {
		if err == nil {
			// In case we return no error, the block is considered recovered and will be deleted.
			if delErr := r.bucket.Delete(ctx, path); delErr != nil {
				level.Warn(r.logger).Log("msg", "failed to delete block metadata", "err", delErr, "path", path)
			}
		}
	}()

	b, err := r.readObject(ctx, path)
	switch {
	case err == nil:
	case errors.Is(err, context.Canceled):
		// Shutdown in progress: keep the object for the next recovery run.
		r.metrics.recoveryAttempts.WithLabelValues("canceled").Inc()
		return err
	case r.bucket.IsObjNotFoundErr(err):
		// This is somewhat opportunistic: the error is likely caused by a competing recovery
		// process that has already recovered the block, before we've discovered that the
		// leadership has changed.
		r.metrics.recoveryAttempts.WithLabelValues("not_found").Inc()
		level.Warn(r.logger).Log("msg", "block metadata not found; skipping", "path", path)
		return nil
	default:
		// This is somewhat opportunistic, as we don't know if the error is transient or not.
		// we should consider an explicit retry mechanism with backoff and a limit on the
		// number of attempts.
		r.metrics.recoveryAttempts.WithLabelValues("read_error").Inc()
		level.Warn(r.logger).Log("msg", "failed to read block metadata; to be retried", "err", err, "path", path)
		return err
	}

	var meta metastorev1.BlockMeta
	if err = meta.UnmarshalVT(b); err != nil {
		// Corrupt payload can never succeed on retry: return nil so the
		// deferred cleanup deletes it rather than retrying forever.
		r.metrics.recoveryAttempts.WithLabelValues("unmarshal_error").Inc()
		level.Error(r.logger).Log("msg", "failed to unmarshal block metadata; skipping", "err", err, "path", path)
		return nil
	}

	switch _, err = r.metastore.AddRecoveredBlock(ctx, &metastorev1.AddBlockRequest{Block: &meta}); {
	case err == nil:
		r.metrics.recoveryAttempts.WithLabelValues("success").Inc()
		level.Debug(r.logger).Log("msg", "successfully recovered block from DLQ", "block_id", meta.Id, "path", path)
		return nil
	case status.Code(err) == codes.InvalidArgument:
		// Permanent rejection by the metastore: delete (return nil), as a
		// retry with the same payload would be rejected again.
		r.metrics.recoveryAttempts.WithLabelValues("invalid_metadata").Inc()
		level.Error(r.logger).Log("msg", "block metadata rejected by metastore; skipping", "err", err, "block_id", meta.Id, "path", path)
		return nil
	case raftnode.IsRaftLeadershipError(err):
		// We are (probably) no longer the leader; the new leader's recovery
		// process will pick the object up. Keep it in the bucket.
		r.metrics.recoveryAttempts.WithLabelValues("leadership_change").Inc()
		level.Warn(r.logger).Log("msg", "leadership change; recovery interrupted", "err", err, "path", path)
		return err
	default:
		// Unknown metastore error: keep the object and retry on a later tick.
		r.metrics.recoveryAttempts.WithLabelValues("metastore_error").Inc()
		level.Error(r.logger).Log("msg", "failed to add block metadata; to be retried", "err", err, "path", path)
		return err
	}
}
   171  
   172  func (r *Recovery) readObject(ctx context.Context, path string) ([]byte, error) {
   173  	rc, err := r.bucket.Get(ctx, path)
   174  	if err != nil {
   175  		return nil, err
   176  	}
   177  	defer func() {
   178  		_ = rc.Close()
   179  	}()
   180  	return io.ReadAll(rc)
   181  }