github.com/grafana/pyroscope@v1.18.0/pkg/metastore/index/dlq/recovery.go

package dlq

import (
	"context"
	"errors"
	"flag"
	"io"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/thanos-io/objstore"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"

	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
	"github.com/grafana/pyroscope/pkg/block"
	"github.com/grafana/pyroscope/pkg/metastore/raftnode"
)

type Config struct {
	CheckInterval time.Duration `yaml:"dlq_recovery_check_interval"`
}

func (c *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
	f.DurationVar(&c.CheckInterval, prefix+"dlq-recovery-check-interval", 15*time.Second, "Dead Letter Queue check interval. 0 to disable.")
}

type Metastore interface {
	AddRecoveredBlock(context.Context, *metastorev1.AddBlockRequest) (*metastorev1.AddBlockResponse, error)
}

type Recovery struct {
	config    Config
	logger    log.Logger
	metastore Metastore
	bucket    objstore.Bucket
	metrics   *metrics

	started bool
	cancel  func()
	m       sync.Mutex
}

func NewRecovery(logger log.Logger, config Config, metastore Metastore, bucket objstore.Bucket, reg prometheus.Registerer) *Recovery {
	return &Recovery{
		config:    config,
		logger:    logger,
		metastore: metastore,
		bucket:    bucket,
		metrics:   newMetrics(reg),
	}
}

func (r *Recovery) Start() {
	if r.config.CheckInterval == 0 {
		return
	}
	r.m.Lock()
	defer r.m.Unlock()
	if r.started {
		r.logger.Log("msg", "recovery already started")
		return
	}
	ctx, cancel := context.WithCancel(context.Background())
	r.cancel = cancel
	r.started = true
	go r.recoverLoop(ctx)
	r.logger.Log("msg", "recovery started")
}

func (r *Recovery) Stop() {
	if r.config.CheckInterval == 0 {
		return
	}
	r.m.Lock()
	defer r.m.Unlock()
	if !r.started {
		r.logger.Log("msg", "recovery already stopped")
		return
	}
	if r.cancel != nil {
		r.cancel()
	}
	r.started = false
	r.logger.Log("msg", "recovery stopped")
}

func (r *Recovery) recoverLoop(ctx context.Context) {
	ticker := time.NewTicker(r.config.CheckInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			r.recoverTick(ctx)
		}
	}
}

func (r *Recovery) recoverTick(ctx context.Context) {
	err := r.bucket.Iter(ctx, block.DirNameDLQ, func(path string) error {
		return r.recover(ctx, path)
	}, objstore.WithRecursiveIter())
	if err != nil {
		level.Error(r.logger).Log("msg", "failed to recover block metadata", "err", err)
	}
}

func (r *Recovery) recover(ctx context.Context, path string) (err error) {
	defer func() {
		if err == nil {
			// If we return no error, the block is considered recovered and will be deleted.
			if delErr := r.bucket.Delete(ctx, path); delErr != nil {
				level.Warn(r.logger).Log("msg", "failed to delete block metadata", "err", delErr, "path", path)
			}
		}
	}()

	b, err := r.readObject(ctx, path)
	switch {
	case err == nil:
	case errors.Is(err, context.Canceled):
		r.metrics.recoveryAttempts.WithLabelValues("canceled").Inc()
		return err
	case r.bucket.IsObjNotFoundErr(err):
		// This is somewhat opportunistic: the error is likely caused by a competing
		// recovery process that has already recovered the block before we discovered
		// that the leadership has changed.
		r.metrics.recoveryAttempts.WithLabelValues("not_found").Inc()
		level.Warn(r.logger).Log("msg", "block metadata not found; skipping", "path", path)
		return nil
	default:
		// This is somewhat opportunistic, as we don't know whether the error is
		// transient. We should consider an explicit retry mechanism with backoff
		// and a limit on the number of attempts.
		r.metrics.recoveryAttempts.WithLabelValues("read_error").Inc()
		level.Warn(r.logger).Log("msg", "failed to read block metadata; to be retried", "err", err, "path", path)
		return err
	}

	var meta metastorev1.BlockMeta
	if err = meta.UnmarshalVT(b); err != nil {
		r.metrics.recoveryAttempts.WithLabelValues("unmarshal_error").Inc()
		level.Error(r.logger).Log("msg", "failed to unmarshal block metadata; skipping", "err", err, "path", path)
		return nil
	}

	switch _, err = r.metastore.AddRecoveredBlock(ctx, &metastorev1.AddBlockRequest{Block: &meta}); {
	case err == nil:
		r.metrics.recoveryAttempts.WithLabelValues("success").Inc()
		level.Debug(r.logger).Log("msg", "successfully recovered block from DLQ", "block_id", meta.Id, "path", path)
		return nil
	case status.Code(err) == codes.InvalidArgument:
		r.metrics.recoveryAttempts.WithLabelValues("invalid_metadata").Inc()
		level.Error(r.logger).Log("msg", "block metadata rejected by metastore; skipping", "err", err, "block_id", meta.Id, "path", path)
		return nil
	case raftnode.IsRaftLeadershipError(err):
		r.metrics.recoveryAttempts.WithLabelValues("leadership_change").Inc()
		level.Warn(r.logger).Log("msg", "leadership change; recovery interrupted", "err", err, "path", path)
		return err
	default:
		r.metrics.recoveryAttempts.WithLabelValues("metastore_error").Inc()
		level.Error(r.logger).Log("msg", "failed to add block metadata; to be retried", "err", err, "path", path)
		return err
	}
}

func (r *Recovery) readObject(ctx context.Context, path string) ([]byte, error) {
	rc, err := r.bucket.Get(ctx, path)
	if err != nil {
		return nil, err
	}
	defer func() {
		_ = rc.Close()
	}()
	return io.ReadAll(rc)
}
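
// Note: the metrics type used above is defined elsewhere in the package
// (metrics.go, not shown here). A plausible reconstruction, inferred only
// from the WithLabelValues calls in this file, is a single counter vector
// keyed by outcome. The metric and label names below are assumptions for
// illustration, not the actual definition:
//
//	type metrics struct {
//		recoveryAttempts *prometheus.CounterVec
//	}
//
//	func newMetrics(reg prometheus.Registerer) *metrics {
//		m := &metrics{
//			recoveryAttempts: prometheus.NewCounterVec(prometheus.CounterOpts{
//				Name: "dlq_recovery_attempts_total", // assumed name
//				Help: "Number of DLQ recovery attempts by outcome.",
//			}, []string{"outcome"}), // assumed label name
//		}
//		if reg != nil {
//			reg.MustRegister(m.recoveryAttempts)
//		}
//		return m
//	}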
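
// A minimal wiring sketch, not part of the original file: it shows how a
// caller might construct and run Recovery against an in-memory bucket.
// stubMetastore and exampleRecoveryWiring are hypothetical names used for
// illustration; in Pyroscope, the metastore service supplies the real
// Metastore implementation.

// stubMetastore accepts every recovered block (hypothetical, illustration only).
type stubMetastore struct{}

func (stubMetastore) AddRecoveredBlock(context.Context, *metastorev1.AddBlockRequest) (*metastorev1.AddBlockResponse, error) {
	return &metastorev1.AddBlockResponse{}, nil
}

func exampleRecoveryWiring() {
	r := NewRecovery(
		log.NewNopLogger(),
		Config{CheckInterval: 15 * time.Second},
		stubMetastore{},
		objstore.NewInMemBucket(),
		prometheus.NewRegistry(),
	)
	r.Start()      // spawns recoverLoop in the background
	defer r.Stop() // cancels the loop's context
	// ... run until shutdown ...
}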