github.com/grafana/pyroscope@v1.18.0/pkg/metastore/metastore.go (about)

     1  package metastore
     2  
     3  import (
     4  	"context"
     5  	"flag"
     6  	"fmt"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/go-kit/log"
    11  	"github.com/go-kit/log/level"
    12  	"github.com/grafana/dskit/grpcclient"
    13  	"github.com/grafana/dskit/services"
    14  	"github.com/prometheus/client_golang/prometheus"
    15  	"github.com/thanos-io/objstore"
    16  	"go.etcd.io/bbolt"
    17  	"google.golang.org/grpc"
    18  
    19  	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
    20  	"github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1/raft_log"
    21  	"github.com/grafana/pyroscope/pkg/metastore/compaction/compactor"
    22  	"github.com/grafana/pyroscope/pkg/metastore/compaction/scheduler"
    23  	"github.com/grafana/pyroscope/pkg/metastore/fsm"
    24  	"github.com/grafana/pyroscope/pkg/metastore/index"
    25  	"github.com/grafana/pyroscope/pkg/metastore/index/cleaner"
    26  	"github.com/grafana/pyroscope/pkg/metastore/index/cleaner/retention"
    27  	"github.com/grafana/pyroscope/pkg/metastore/index/dlq"
    28  	"github.com/grafana/pyroscope/pkg/metastore/index/tombstones"
    29  	"github.com/grafana/pyroscope/pkg/metastore/raftnode"
    30  	"github.com/grafana/pyroscope/pkg/metastore/raftnode/raftnodepb"
    31  	"github.com/grafana/pyroscope/pkg/metastore/tracing"
    32  	placement "github.com/grafana/pyroscope/pkg/segmentwriter/client/distributor/placement/adaptiveplacement"
    33  	"github.com/grafana/pyroscope/pkg/util/health"
    34  )
    35  
// Config holds the metastore configuration. Note that the FSM, Compactor,
// and Scheduler sub-configurations are inlined into the parent YAML block,
// while Raft and Index live under their own keys.
type Config struct {
	// Address is the advertised metastore gRPC address (required).
	Address          string            `yaml:"address"`
	GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate with the metastore."`
	// MinReadyDuration delays the readiness endpoint after internal checks
	// pass, and is also reused as the settle/drain pause during shutdown.
	MinReadyDuration time.Duration     `yaml:"min_ready_duration" category:"advanced"`
	Raft             raftnode.Config   `yaml:"raft"`
	FSM              fsm.Config        `yaml:",inline" category:"advanced"`
	Index            index.Config      `yaml:"index" category:"advanced"`
	Compactor        compactor.Config  `yaml:",inline" category:"advanced"`
	Scheduler        scheduler.Config  `yaml:",inline" category:"advanced"`
}
    46  
    47  func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
    48  	const prefix = "metastore."
    49  	f.StringVar(&cfg.Address, prefix+"address", "localhost:9095", "")
    50  	f.DurationVar(&cfg.MinReadyDuration, prefix+"min-ready-duration", 15*time.Second, "Minimum duration to wait after the internal readiness checks have passed but before succeeding the readiness endpoint. This is used to slowdown deployment controllers (eg. Kubernetes) after an instance is ready and before they proceed with a rolling update, to give the rest of the cluster instances enough time to receive some (DNS?) updates.")
    51  	cfg.GRPCClientConfig.RegisterFlagsWithPrefix(prefix+"grpc-client-config", f)
    52  	cfg.Raft.RegisterFlagsWithPrefix(prefix+"raft.", f)
    53  	cfg.FSM.RegisterFlagsWithPrefix(prefix, f)
    54  	cfg.Compactor.RegisterFlagsWithPrefix(prefix, f)
    55  	cfg.Scheduler.RegisterFlagsWithPrefix(prefix, f)
    56  	cfg.Index.RegisterFlagsWithPrefix(prefix+"index.", f)
    57  }
    58  
    59  func (cfg *Config) Validate() error {
    60  	if cfg.Address == "" {
    61  		return fmt.Errorf("metastore.address is required")
    62  	}
    63  	if err := cfg.GRPCClientConfig.Validate(); err != nil {
    64  		return err
    65  	}
    66  	return cfg.Raft.Validate()
    67  }
    68  
// Metastore is the top-level component coordinating the raft-replicated
// block index and compaction planning. It owns the FSM, the raft node,
// and the gRPC services exposed to segment writers and queriers.
type Metastore struct {
	// service wraps starting/running/stopping into a dskit service.
	service services.Service

	config    Config
	overrides Overrides
	logger    log.Logger
	reg       prometheus.Registerer
	health    health.Service

	// Raft replication machinery: the node, the state machine it applies
	// commands to, and the client used to reach other raft members.
	raft            *raftnode.Node
	fsm             *fsm.FSM
	contextRegistry *tracing.ContextRegistry
	raftNodeClient  raftnodepb.RaftNodeServiceClient

	// Leader-only background components (see RunOnLeader in New).
	bucket    objstore.Bucket
	placement *placement.Manager
	recovery  *dlq.Recovery
	cleaner   *cleaner.Cleaner

	// Block metadata index, its FSM command handler, and gRPC service.
	index        *index.Index
	indexHandler *IndexCommandHandler
	indexService *IndexService

	// Compaction planning state, FSM command handler, and gRPC service.
	tombstones        *tombstones.Tombstones
	compactor         *compactor.Compactor
	scheduler         *scheduler.Scheduler
	compactionHandler *CompactionCommandHandler
	compactionService *CompactionService

	// Read paths: leaderRead enforces leadership, followerRead only
	// waits for the leader's commit index to be applied locally.
	leaderRead    *raftnode.StateReader[*bbolt.Tx]
	followerRead  *raftnode.StateReader[*bbolt.Tx]
	tenantService *TenantService
	queryService  *QueryService

	// readySince records the first instant the readiness checks passed;
	// used by CheckReady to enforce MinReadyDuration.
	readyOnce  sync.Once
	readySince time.Time
}
   106  
// Overrides exposes the per-tenant limits the metastore consumes;
// currently only the retention overrides used by the index cleaner.
type Overrides interface {
	retention.Overrides
}
   110  
// New builds a fully wired Metastore: it creates the FSM store, registers
// all raft command handlers and restorers, starts the raft node, and then
// constructs the gRPC services and leader-only background components.
// The order matters: the FSM must be fully configured before raft is
// initialized, and services are created only after both exist.
func New(
	config Config,
	overrides Overrides,
	logger log.Logger,
	reg prometheus.Registerer,
	healthService health.Service,
	client raftnodepb.RaftNodeServiceClient,
	bucket objstore.Bucket,
	placementMgr *placement.Manager,
) (*Metastore, error) {
	m := &Metastore{
		config:          config,
		overrides:       overrides,
		logger:          logger,
		reg:             reg,
		health:          healthService,
		bucket:          bucket,
		placement:       placementMgr,
		raftNodeClient:  client,
		contextRegistry: tracing.NewContextRegistry(reg),
	}

	var err error
	if m.fsm, err = fsm.New(m.logger, m.reg, m.config.FSM, m.contextRegistry); err != nil {
		return nil, fmt.Errorf("failed to initialize store: %w", err)
	}

	// Initialization of the base components.
	m.index = index.NewIndex(m.logger, index.NewStore(), config.Index)
	m.tombstones = tombstones.NewTombstones(tombstones.NewStore(), m.reg)
	m.compactor = compactor.NewCompactor(config.Compactor, compactor.NewStore(), m.tombstones, m.reg)
	m.scheduler = scheduler.NewScheduler(config.Scheduler, scheduler.NewStore(), m.reg)

	// FSM handlers that utilize the components. Each raft log entry type
	// is dispatched to exactly one handler.
	m.indexHandler = NewIndexCommandHandler(m.logger, m.index, m.tombstones, m.compactor)
	fsm.RegisterRaftCommandHandler(m.fsm,
		fsm.RaftLogEntryType(raft_log.RaftCommand_RAFT_COMMAND_ADD_BLOCK_METADATA),
		m.indexHandler.AddBlock)
	fsm.RegisterRaftCommandHandler(m.fsm,
		fsm.RaftLogEntryType(raft_log.RaftCommand_RAFT_COMMAND_TRUNCATE_INDEX),
		m.indexHandler.TruncateIndex)

	m.compactionHandler = NewCompactionCommandHandler(m.logger, m.index, m.compactor, m.compactor, m.scheduler, m.tombstones)
	fsm.RegisterRaftCommandHandler(m.fsm,
		fsm.RaftLogEntryType(raft_log.RaftCommand_RAFT_COMMAND_GET_COMPACTION_PLAN_UPDATE),
		m.compactionHandler.GetCompactionPlanUpdate)
	fsm.RegisterRaftCommandHandler(m.fsm,
		fsm.RaftLogEntryType(raft_log.RaftCommand_RAFT_COMMAND_UPDATE_COMPACTION_PLAN),
		m.compactionHandler.UpdateCompactionPlan)

	// Restorers rebuild component state from FSM snapshots.
	m.fsm.RegisterRestorer(m.tombstones)
	m.fsm.RegisterRestorer(m.compactor)
	m.fsm.RegisterRestorer(m.scheduler)
	m.fsm.RegisterRestorer(m.index)

	// We are ready to start raft as our FSM is fully configured.
	if err = m.buildRaftNode(); err != nil {
		return nil, err
	}

	// Create the read-only interfaces to the state.
	m.followerRead = m.newFollowerReader(client, m.raft, m.fsm)
	m.leaderRead = m.newLeaderReader(m.raft, m.fsm)

	// Services should be registered after FSM and Raft have been initialized.
	// Services provide an interface to interact with the metastore components.
	m.compactionService = NewCompactionService(m.logger, m.raft)
	m.indexService = NewIndexService(m.logger, m.raft, m.leaderRead, m.index, m.placement)
	m.tenantService = NewTenantService(m.logger, m.followerRead, m.index)
	m.queryService = NewQueryService(m.logger, m.followerRead, m.index)
	m.recovery = dlq.NewRecovery(logger, config.Index.Recovery, m.indexService, bucket, m.reg)
	m.cleaner = cleaner.NewCleaner(m.logger, m.overrides, config.Index.Cleaner, m.indexService)

	// These are the services that only run on the raft leader.
	// Keep in mind that the node may not be the leader at the moment the
	// service is starting, so it should be able to handle conflicts.
	m.raft.RunOnLeader(m.recovery)
	m.raft.RunOnLeader(m.placement)
	m.raft.RunOnLeader(m.cleaner)

	m.service = services.NewBasicService(m.starting, m.running, m.stopping)
	return m, nil
}
   194  
   195  func (m *Metastore) buildRaftNode() (err error) {
   196  	// Raft is configured to always restore the state from the latest snapshot
   197  	// (via FSM.Restore), if it is present. Otherwise, when no snapshots
   198  	// available, the state must be initialized explicitly via FSM.Init before
   199  	// we call raft.Init, which starts applying the raft log.
   200  	if m.raft, err = raftnode.NewNode(m.logger, m.config.Raft, m.reg, m.fsm, m.contextRegistry, m.raftNodeClient); err != nil {
   201  		return fmt.Errorf("failed to create raft node: %w", err)
   202  	}
   203  
   204  	// Newly created raft node is not yet initialized and does not alter our
   205  	// FSM in any way. However, it gives us access to the snapshot store, and
   206  	// we can check whether we need to initialize the state (expensive), or we
   207  	// can defer to raft snapshots. This is an optimization: we want to avoid
   208  	// restoring the state twice: once at Init, and then at Restore.
   209  	snapshots, err := m.raft.ListSnapshots()
   210  	if err != nil {
   211  		level.Error(m.logger).Log("msg", "failed to list snapshots", "err", err)
   212  		// We continue trying; in the worst case we will initialize the state
   213  		// and then restore a snapshot received from the leader.
   214  	}
   215  
   216  	if len(snapshots) == 0 {
   217  		level.Info(m.logger).Log("msg", "no state snapshots found")
   218  		// FSM won't be restored by raft, so we need to initialize it manually.
   219  		// Otherwise, raft will restore the state from a snapshot using
   220  		// fsm.Restore, which will initialize the state as well.
   221  		if err = m.fsm.Init(); err != nil {
   222  			level.Error(m.logger).Log("msg", "failed to initialize state", "err", err)
   223  			return err
   224  		}
   225  	} else {
   226  		level.Info(m.logger).Log("msg", "skipping state initialization as snapshots found")
   227  	}
   228  
   229  	if err = m.raft.Init(); err != nil {
   230  		return fmt.Errorf("failed to initialize raft: %w", err)
   231  	}
   232  
   233  	return nil
   234  }
   235  
   236  func (m *Metastore) Register(server *grpc.Server) {
   237  	metastorev1.RegisterIndexServiceServer(server, m.indexService)
   238  	metastorev1.RegisterCompactionServiceServer(server, m.compactionService)
   239  	metastorev1.RegisterMetadataQueryServiceServer(server, m.queryService)
   240  	metastorev1.RegisterTenantServiceServer(server, m.tenantService)
   241  	m.raft.Register(server)
   242  }
   243  
// Service returns the dskit service managing the metastore lifecycle.
func (m *Metastore) Service() services.Service { return m.service }
   245  
// starting is a no-op: all initialization is done eagerly in New.
func (m *Metastore) starting(context.Context) error { return nil }
   247  
// stopping performs a graceful shutdown: transfer raft leadership (if held),
// drain clients for MinReadyDuration, then shut down raft and the FSM.
func (m *Metastore) stopping(_ error) error {
	// We let clients observe the leadership transfer: it's their
	// responsibility to connect to the new leader. We only need to
	// make sure that any error returned to clients includes details
	// about the raft leader, if applicable.
	if err := m.raft.TransferLeadership(); err == nil {
		// We were the leader and managed to transfer leadership – wait a bit
		// to let the new leader settle. During this period we're still serving
		// requests, but return an error with the new leader address.
		level.Info(m.logger).Log("msg", "waiting for leadership transfer to complete")
		time.Sleep(m.config.MinReadyDuration)
	}

	// Tell clients to stop sending requests to this node. There are no
	// guarantees that clients will see or obey this. Normally, we would have
	// stopped the gRPC server here, but we can't: it's managed by the service
	// framework. Because of that we sleep another MinReadyDuration to let new
	// clients discover that the node is not serving anymore.
	m.health.SetNotServing()
	time.Sleep(m.config.MinReadyDuration)

	// Shut down raft before the FSM it applies commands to; the context
	// registry (tracing bookkeeping) is closed last.
	m.raft.Shutdown()
	m.fsm.Shutdown()
	if m.contextRegistry != nil {
		m.contextRegistry.Shutdown()
	}
	return nil
}
   276  
// running marks the node as serving and blocks until the service
// context is canceled.
func (m *Metastore) running(ctx context.Context) error {
	m.health.SetServing()
	<-ctx.Done()
	return nil
}
   282  
   283  // CheckReady verifies if the metastore is ready to serve requests by
   284  // ensuring the node is up-to-date with the leader's commit index.
   285  func (m *Metastore) CheckReady(ctx context.Context) error {
   286  	if _, err := m.followerRead.WaitLeaderCommitIndexApplied(ctx); err != nil {
   287  		return err
   288  	}
   289  	m.readyOnce.Do(func() {
   290  		m.readySince = time.Now()
   291  	})
   292  	if w := m.config.MinReadyDuration - time.Since(m.readySince); w > 0 {
   293  		return fmt.Errorf("%v before reporting readiness", w)
   294  	}
   295  	return nil
   296  }