// Package metastore implements the Pyroscope metastore service: a
// Raft-replicated state machine that maintains the block metadata index,
// compaction planning state, and tombstones, and exposes them over gRPC.
package metastore

import (
	"context"
	"flag"
	"fmt"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/grpcclient"
	"github.com/grafana/dskit/services"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/thanos-io/objstore"
	"go.etcd.io/bbolt"
	"google.golang.org/grpc"

	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
	"github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1/raft_log"
	"github.com/grafana/pyroscope/pkg/metastore/compaction/compactor"
	"github.com/grafana/pyroscope/pkg/metastore/compaction/scheduler"
	"github.com/grafana/pyroscope/pkg/metastore/fsm"
	"github.com/grafana/pyroscope/pkg/metastore/index"
	"github.com/grafana/pyroscope/pkg/metastore/index/cleaner"
	"github.com/grafana/pyroscope/pkg/metastore/index/cleaner/retention"
	"github.com/grafana/pyroscope/pkg/metastore/index/dlq"
	"github.com/grafana/pyroscope/pkg/metastore/index/tombstones"
	"github.com/grafana/pyroscope/pkg/metastore/raftnode"
	"github.com/grafana/pyroscope/pkg/metastore/raftnode/raftnodepb"
	"github.com/grafana/pyroscope/pkg/metastore/tracing"
	placement "github.com/grafana/pyroscope/pkg/segmentwriter/client/distributor/placement/adaptiveplacement"
	"github.com/grafana/pyroscope/pkg/util/health"
)

// Config holds the metastore service configuration. The FSM, Compactor, and
// Scheduler sections are inlined into the parent YAML document rather than
// nested under their own keys (note the `yaml:",inline"` tags).
type Config struct {
	Address          string            `yaml:"address"`
	GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate with the metastore."`
	MinReadyDuration time.Duration     `yaml:"min_ready_duration" category:"advanced"`
	Raft             raftnode.Config   `yaml:"raft"`
	FSM              fsm.Config        `yaml:",inline" category:"advanced"`
	Index            index.Config      `yaml:"index" category:"advanced"`
	Compactor        compactor.Config  `yaml:",inline" category:"advanced"`
	Scheduler        scheduler.Config  `yaml:",inline" category:"advanced"`
}

// RegisterFlags registers the metastore command-line flags, all under the
// "metastore." prefix, and delegates to the sub-component configs.
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	const prefix = "metastore."
	f.StringVar(&cfg.Address, prefix+"address", "localhost:9095", "")
	f.DurationVar(&cfg.MinReadyDuration, prefix+"min-ready-duration", 15*time.Second, "Minimum duration to wait after the internal readiness checks have passed but before succeeding the readiness endpoint. This is used to slowdown deployment controllers (eg. Kubernetes) after an instance is ready and before they proceed with a rolling update, to give the rest of the cluster instances enough time to receive some (DNS?) updates.")
	cfg.GRPCClientConfig.RegisterFlagsWithPrefix(prefix+"grpc-client-config", f)
	cfg.Raft.RegisterFlagsWithPrefix(prefix+"raft.", f)
	// FSM, Compactor, and Scheduler register their flags directly under the
	// "metastore." prefix (no sub-prefix), mirroring the inline YAML layout.
	cfg.FSM.RegisterFlagsWithPrefix(prefix, f)
	cfg.Compactor.RegisterFlagsWithPrefix(prefix, f)
	cfg.Scheduler.RegisterFlagsWithPrefix(prefix, f)
	cfg.Index.RegisterFlagsWithPrefix(prefix+"index.", f)
}

// Validate checks that the address is set and that the gRPC client and Raft
// sub-configurations are valid.
func (cfg *Config) Validate() error {
	if cfg.Address == "" {
		return fmt.Errorf("metastore.address is required")
	}
	if err := cfg.GRPCClientConfig.Validate(); err != nil {
		return err
	}
	return cfg.Raft.Validate()
}

// Metastore wires together the Raft node, the FSM-backed state components
// (index, tombstones, compactor, scheduler), and the gRPC services that
// expose them. Its lifecycle is managed through the dskit services.Service
// returned by Service().
type Metastore struct {
	service services.Service

	config    Config
	overrides Overrides
	logger    log.Logger
	reg       prometheus.Registerer
	health    health.Service

	raft            *raftnode.Node
	fsm             *fsm.FSM
	contextRegistry *tracing.ContextRegistry
	raftNodeClient  raftnodepb.RaftNodeServiceClient

	bucket    objstore.Bucket
	placement *placement.Manager
	recovery  *dlq.Recovery
	cleaner   *cleaner.Cleaner

	index        *index.Index
	indexHandler *IndexCommandHandler
	indexService *IndexService

	tombstones        *tombstones.Tombstones
	compactor         *compactor.Compactor
	scheduler         *scheduler.Scheduler
	compactionHandler *CompactionCommandHandler
	compactionService *CompactionService

	// leaderRead serves strongly-consistent reads through the leader;
	// followerRead serves reads from the local state once it has caught up
	// with the leader's commit index.
	leaderRead   *raftnode.StateReader[*bbolt.Tx]
	followerRead *raftnode.StateReader[*bbolt.Tx]
	tenantService *TenantService
	queryService  *QueryService

	// readySince records when CheckReady first passed; used to enforce
	// MinReadyDuration before reporting readiness.
	readyOnce  sync.Once
	readySince time.Time
}

// Overrides exposes the per-tenant retention overrides consumed by the
// index cleaner.
type Overrides interface {
	retention.Overrides
}

// New constructs a fully wired Metastore. The order of initialization
// matters: the FSM and its command handlers/restorers must be configured
// before the Raft node is built (Raft may immediately start applying log
// entries or restoring snapshots), and the gRPC services are created only
// after both FSM and Raft exist. The returned Metastore is not started;
// run it via Service().
func New(
	config Config,
	overrides Overrides,
	logger log.Logger,
	reg prometheus.Registerer,
	healthService health.Service,
	client raftnodepb.RaftNodeServiceClient,
	bucket objstore.Bucket,
	placementMgr *placement.Manager,
) (*Metastore, error) {
	m := &Metastore{
		config:          config,
		overrides:       overrides,
		logger:          logger,
		reg:             reg,
		health:          healthService,
		bucket:          bucket,
		placement:       placementMgr,
		raftNodeClient:  client,
		contextRegistry: tracing.NewContextRegistry(reg),
	}

	var err error
	if m.fsm, err = fsm.New(m.logger, m.reg, m.config.FSM, m.contextRegistry); err != nil {
		return nil, fmt.Errorf("failed to initialize store: %w", err)
	}

	// Initialization of the base components.
	m.index = index.NewIndex(m.logger, index.NewStore(), config.Index)
	m.tombstones = tombstones.NewTombstones(tombstones.NewStore(), m.reg)
	m.compactor = compactor.NewCompactor(config.Compactor, compactor.NewStore(), m.tombstones, m.reg)
	m.scheduler = scheduler.NewScheduler(config.Scheduler, scheduler.NewStore(), m.reg)

	// FSM handlers that utilize the components.
	m.indexHandler = NewIndexCommandHandler(m.logger, m.index, m.tombstones, m.compactor)
	fsm.RegisterRaftCommandHandler(m.fsm,
		fsm.RaftLogEntryType(raft_log.RaftCommand_RAFT_COMMAND_ADD_BLOCK_METADATA),
		m.indexHandler.AddBlock)
	fsm.RegisterRaftCommandHandler(m.fsm,
		fsm.RaftLogEntryType(raft_log.RaftCommand_RAFT_COMMAND_TRUNCATE_INDEX),
		m.indexHandler.TruncateIndex)

	// NOTE(review): m.compactor is passed twice — presumably it satisfies two
	// distinct interfaces of the handler; confirm against
	// NewCompactionCommandHandler's signature.
	m.compactionHandler = NewCompactionCommandHandler(m.logger, m.index, m.compactor, m.compactor, m.scheduler, m.tombstones)
	fsm.RegisterRaftCommandHandler(m.fsm,
		fsm.RaftLogEntryType(raft_log.RaftCommand_RAFT_COMMAND_GET_COMPACTION_PLAN_UPDATE),
		m.compactionHandler.GetCompactionPlanUpdate)
	fsm.RegisterRaftCommandHandler(m.fsm,
		fsm.RaftLogEntryType(raft_log.RaftCommand_RAFT_COMMAND_UPDATE_COMPACTION_PLAN),
		m.compactionHandler.UpdateCompactionPlan)

	m.fsm.RegisterRestorer(m.tombstones)
	m.fsm.RegisterRestorer(m.compactor)
	m.fsm.RegisterRestorer(m.scheduler)
	m.fsm.RegisterRestorer(m.index)

	// We are ready to start raft as our FSM is fully configured.
	if err = m.buildRaftNode(); err != nil {
		return nil, err
	}

	// Create the read-only interfaces to the state.
	m.followerRead = m.newFollowerReader(client, m.raft, m.fsm)
	m.leaderRead = m.newLeaderReader(m.raft, m.fsm)

	// Services should be registered after FSM and Raft have been initialized.
	// Services provide an interface to interact with the metastore components.
	m.compactionService = NewCompactionService(m.logger, m.raft)
	m.indexService = NewIndexService(m.logger, m.raft, m.leaderRead, m.index, m.placement)
	m.tenantService = NewTenantService(m.logger, m.followerRead, m.index)
	m.queryService = NewQueryService(m.logger, m.followerRead, m.index)
	m.recovery = dlq.NewRecovery(logger, config.Index.Recovery, m.indexService, bucket, m.reg)
	m.cleaner = cleaner.NewCleaner(m.logger, m.overrides, config.Index.Cleaner, m.indexService)

	// These are the services that only run on the raft leader.
	// Keep in mind that the node may not be the leader at the moment the
	// service is starting, so it should be able to handle conflicts.
	m.raft.RunOnLeader(m.recovery)
	m.raft.RunOnLeader(m.placement)
	m.raft.RunOnLeader(m.cleaner)

	m.service = services.NewBasicService(m.starting, m.running, m.stopping)
	return m, nil
}

// buildRaftNode creates the Raft node and initializes the FSM state,
// either explicitly (when no snapshots exist) or implicitly via Raft's
// snapshot restore path.
func (m *Metastore) buildRaftNode() (err error) {
	// Raft is configured to always restore the state from the latest snapshot
	// (via FSM.Restore), if it is present. Otherwise, when no snapshots
	// available, the state must be initialized explicitly via FSM.Init before
	// we call raft.Init, which starts applying the raft log.
	if m.raft, err = raftnode.NewNode(m.logger, m.config.Raft, m.reg, m.fsm, m.contextRegistry, m.raftNodeClient); err != nil {
		return fmt.Errorf("failed to create raft node: %w", err)
	}

	// Newly created raft node is not yet initialized and does not alter our
	// FSM in any way. However, it gives us access to the snapshot store, and
	// we can check whether we need to initialize the state (expensive), or we
	// can defer to raft snapshots. This is an optimization: we want to avoid
	// restoring the state twice: once at Init, and then at Restore.
	snapshots, err := m.raft.ListSnapshots()
	if err != nil {
		level.Error(m.logger).Log("msg", "failed to list snapshots", "err", err)
		// We continue trying; in the worst case we will initialize the state
		// and then restore a snapshot received from the leader.
	}

	if len(snapshots) == 0 {
		level.Info(m.logger).Log("msg", "no state snapshots found")
		// FSM won't be restored by raft, so we need to initialize it manually.
		// Otherwise, raft will restore the state from a snapshot using
		// fsm.Restore, which will initialize the state as well.
		if err = m.fsm.Init(); err != nil {
			level.Error(m.logger).Log("msg", "failed to initialize state", "err", err)
			return err
		}
	} else {
		level.Info(m.logger).Log("msg", "skipping state initialization as snapshots found")
	}

	if err = m.raft.Init(); err != nil {
		return fmt.Errorf("failed to initialize raft: %w", err)
	}

	return nil
}

// Register registers all metastore gRPC services, plus the raft node
// service, with the given server.
func (m *Metastore) Register(server *grpc.Server) {
	metastorev1.RegisterIndexServiceServer(server, m.indexService)
	metastorev1.RegisterCompactionServiceServer(server, m.compactionService)
	metastorev1.RegisterMetadataQueryServiceServer(server, m.queryService)
	metastorev1.RegisterTenantServiceServer(server, m.tenantService)
	m.raft.Register(server)
}

// Service returns the dskit service that manages the Metastore lifecycle.
func (m *Metastore) Service() services.Service { return m.service }

// starting is a no-op: all initialization happens in New.
func (m *Metastore) starting(context.Context) error { return nil }

// stopping performs a graceful shutdown: leadership transfer, a grace
// period for clients to notice, then raft and FSM shutdown.
func (m *Metastore) stopping(_ error) error {
	// We let clients observe the leadership transfer: it's their
	// responsibility to connect to the new leader. We only need to
	// make sure that any error returned to clients includes details
	// about the raft leader, if applicable.
	if err := m.raft.TransferLeadership(); err == nil {
		// We were the leader and managed to transfer leadership – wait a bit
		// to let the new leader settle. During this period we're still serving
		// requests, but return an error with the new leader address.
		level.Info(m.logger).Log("msg", "waiting for leadership transfer to complete")
		time.Sleep(m.config.MinReadyDuration)
	}

	// Tell clients to stop sending requests to this node. There are no
	// guarantees that clients will see or obey this. Normally, we would have
	// stopped the gRPC server here, but we can't: it's managed by the service
	// framework. Because of that we sleep another MinReadyDuration to let new
	// clients discover that the node is not serving anymore.
	m.health.SetNotServing()
	time.Sleep(m.config.MinReadyDuration)

	m.raft.Shutdown()
	m.fsm.Shutdown()
	if m.contextRegistry != nil {
		m.contextRegistry.Shutdown()
	}
	return nil
}

// running marks the node as serving and blocks until the context is done.
func (m *Metastore) running(ctx context.Context) error {
	m.health.SetServing()
	<-ctx.Done()
	return nil
}

// CheckReady verifies if the metastore is ready to serve requests by
// ensuring the node is up-to-date with the leader's commit index.
// After the first success, readiness is still withheld until
// MinReadyDuration has elapsed, to slow down rolling updates.
func (m *Metastore) CheckReady(ctx context.Context) error {
	if _, err := m.followerRead.WaitLeaderCommitIndexApplied(ctx); err != nil {
		return err
	}
	// Record the time of the first successful readiness check exactly once.
	m.readyOnce.Do(func() {
		m.readySince = time.Now()
	})
	if w := m.config.MinReadyDuration - time.Since(m.readySince); w > 0 {
		return fmt.Errorf("%v before reporting readiness", w)
	}
	return nil
}