github.com/grafana/pyroscope@v1.18.0/pkg/metastore/fsm/fsm.go (about)

     1  package fsm
     2  
     3  import (
     4  	"context"
     5  	"encoding/binary"
     6  	"flag"
     7  	"fmt"
     8  	"io"
     9  	"strconv"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/go-kit/log"
    14  	"github.com/go-kit/log/level"
    15  	"github.com/hashicorp/raft"
    16  	"github.com/opentracing/opentracing-go"
    17  	"github.com/prometheus/client_golang/prometheus"
    18  	"go.etcd.io/bbolt"
    19  	"go.etcd.io/bbolt/errors"
    20  	"golang.org/x/sync/errgroup"
    21  	"google.golang.org/protobuf/proto"
    22  
    23  	"github.com/grafana/pyroscope/pkg/metastore/tracing"
    24  )
    25  
// ContextRegistry gives the FSM access to contexts stored under a string ID.
// applyCommand looks a context up by the ID carried in raft.Log.Extensions
// and deletes the entry once the command has been handled.
type ContextRegistry interface {
	// Retrieve returns the context stored under id; the boolean reports
	// whether an entry was found.
	Retrieve(id string) (context.Context, bool)
	// Delete removes the entry stored under id.
	Delete(id string)
	// Size presumably reports the number of stored entries; the FSM code
	// does not call it (TODO: confirm against the implementation).
	Size() int
}
    31  
// RaftHandler is a function that processes a Raft command.
// The implementation MUST be idempotent.
//
// The handler is given the BoltDB write transaction the FSM opened for the
// command, the raft log entry, and the request decoded from the entry
// payload. The FSM commits the transaction after the handler returns; a
// handler error makes the FSM roll the transaction back and panic.
//
// The context parameter is used for tracing purposes and is only available on the leader.
type RaftHandler[Req, Resp proto.Message] func(context.Context, *bbolt.Tx, *raft.Log, Req) (Resp, error)
    36  
// StateRestorer is called during the FSM initialization
// to restore the state from a snapshot.
// The implementation MUST be idempotent.
//
// Both methods are invoked from FSM.Init and again from FSM.Restore after
// the database has been replaced with the snapshot contents: first Init for
// every registered restorer, then Restore for every registered restorer.
type StateRestorer interface {
	// Init is provided with a write transaction to initialize the state.
	// FSM guarantees that Init is called synchronously and has exclusive
	// access to the database.
	// All restorers share the same transaction: it is committed only if
	// every Init call succeeds, and rolled back otherwise.
	Init(*bbolt.Tx) error
	// Restore is provided with a read transaction to restore the state.
	// Restore might be called concurrently with other StateRestorer
	// instances.
	// Each restorer gets its own read transaction.
	Restore(*bbolt.Tx) error
}
    50  
    51  type Config struct {
    52  	SnapshotCompression      string `yaml:"snapshot_compression"`
    53  	SnapshotRateLimit        int    `yaml:"snapshot_rate_limit"`
    54  	SnapshotCompactOnRestore bool   `yaml:"snapshot_compact_on_restore"`
    55  	// Where the FSM BoltDB data is located.
    56  	// Does not have to be a persistent volume.
    57  	DataDir string `yaml:"data_dir"`
    58  }
    59  
    60  func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
    61  	f.StringVar(&cfg.SnapshotCompression, prefix+"snapshot-compression", "zstd", "Compression algorithm to use for snapshots. Supported compressions: zstd.")
    62  	f.IntVar(&cfg.SnapshotRateLimit, prefix+"snapshot-rate-limit", 15, "Rate limit for snapshot writer in MB/s.")
    63  	f.BoolVar(&cfg.SnapshotCompactOnRestore, prefix+"snapshot-compact-on-restore", false, "Compact the database on restore.")
    64  	f.StringVar(&cfg.DataDir, prefix+"data-dir", "./data-metastore/data", "Directory to store the data.")
    65  }
    66  
// FSM implements the raft.FSM interface.
// It keeps its state in BoltDB and dispatches raft commands to the handlers
// registered with RegisterRaftCommandHandler.
type FSM struct {
	logger          log.Logger
	config          Config
	contextRegistry ContextRegistry
	metrics         *metrics

	// mu and txns coordinate Read with Restore: Read registers its
	// transaction in txns while holding mu for reading; Restore takes mu
	// exclusively and waits for txns before replacing the database.
	mu   sync.RWMutex
	txns sync.WaitGroup
	db   *boltdb

	// handlers maps a command type to the function that applies it;
	// restorers are invoked on Init and after a snapshot restore.
	handlers  map[RaftLogEntryType]handler
	restorers []StateRestorer

	// Term and index of the last applied command. They are persisted in
	// the raft bucket and loaded back by restore.
	appliedTerm  uint64
	appliedIndex uint64
}
    84  
// handler is the type-erased form of RaftHandler stored in FSM.handlers.
// It receives the raw (still encoded) command payload together with the
// tracing transaction wrapper, and returns the response message.
type handler func(ctx context.Context, tx *tracingTx, cmd *raft.Log, raw []byte) (proto.Message, error)
    86  
    87  func New(logger log.Logger, reg prometheus.Registerer, config Config, contextRegistry ContextRegistry) (*FSM, error) {
    88  	fsm := FSM{
    89  		logger:          logger,
    90  		config:          config,
    91  		contextRegistry: contextRegistry,
    92  		metrics:         newMetrics(reg),
    93  		handlers:        make(map[RaftLogEntryType]handler),
    94  	}
    95  	db := newDB(logger, fsm.metrics, config)
    96  	if err := db.open(false); err != nil {
    97  		return nil, err
    98  	}
    99  	fsm.db = db
   100  	return &fsm, nil
   101  }
   102  
   103  func (fsm *FSM) RegisterRestorer(r ...StateRestorer) {
   104  	fsm.restorers = append(fsm.restorers, r...)
   105  }
   106  
   107  func RegisterRaftCommandHandler[Req, Resp proto.Message](fsm *FSM, t RaftLogEntryType, handler RaftHandler[Req, Resp]) {
   108  	fsm.handlers[t] = func(ctx context.Context, tx *tracingTx, cmd *raft.Log, raw []byte) (proto.Message, error) {
   109  		req, err := unmarshal[Req](raw)
   110  		if err != nil {
   111  			return nil, err
   112  		}
   113  		return handler(ctx, tx.Tx, cmd, req)
   114  	}
   115  }
   116  
   117  // Init must be called after the FSM is created and all restorers are registered.
   118  func (fsm *FSM) Init() error {
   119  	if err := fsm.init(); err != nil {
   120  		return fmt.Errorf("failed to initialize state: %w", err)
   121  	}
   122  	if err := fsm.restore(); err != nil {
   123  		return fmt.Errorf("failed to restore state: %w", err)
   124  	}
   125  	return nil
   126  }
   127  
   128  func (fsm *FSM) init() (err error) {
   129  	tx, err := fsm.db.boltdb.Begin(true)
   130  	if err != nil {
   131  		return err
   132  	}
   133  	defer func() {
   134  		if err == nil {
   135  			err = tx.Commit()
   136  		} else {
   137  			_ = tx.Rollback()
   138  		}
   139  	}()
   140  	if err = fsm.initRaftBucket(tx); err != nil {
   141  		return fmt.Errorf("failed to init raft bucket: %w", err)
   142  	}
   143  	for _, r := range fsm.restorers {
   144  		if err = r.Init(tx); err != nil {
   145  			return err
   146  		}
   147  	}
   148  	return nil
   149  }
   150  
   151  func (fsm *FSM) restore() error {
   152  	if err := fsm.db.boltdb.View(fsm.loadAppliedIndex); err != nil {
   153  		return fmt.Errorf("failed to load applied index: %w", err)
   154  	}
   155  	level.Info(fsm.logger).Log("msg", "restoring state", "term", fsm.appliedTerm, "applied_index", fsm.appliedIndex)
   156  	g, _ := errgroup.WithContext(context.Background())
   157  	for _, r := range fsm.restorers {
   158  		g.Go(func() error {
   159  			return fsm.db.boltdb.View(r.Restore)
   160  		})
   161  	}
   162  	return g.Wait()
   163  }
   164  
// Restore restores the FSM state from a snapshot.
//
// The snapshot is always closed before Restore returns, and the elapsed
// time is recorded in the fsmRestoreSnapshotDuration metric regardless of
// the outcome. Every failure is logged and returned to the caller.
//
// While the database is being replaced, fsm.mu is held exclusively and
// Restore waits for in-flight read transactions (see Read) to finish.
func (fsm *FSM) Restore(snapshot io.ReadCloser) (err error) {
	start := time.Now()
	level.Info(fsm.logger).Log("msg", "restoring snapshot")
	defer func() {
		_ = snapshot.Close()
		fsm.db.metrics.fsmRestoreSnapshotDuration.Observe(time.Since(start).Seconds())
	}()

	var r *snapshotReader
	if r, err = newSnapshotReader(snapshot); err != nil {
		level.Error(fsm.logger).Log("msg", "failed to create snapshot reader", "err", err)
		return err
	}
	// The wrapper never returns errors on Close.
	defer r.Close()

	// Block all new transactions until we restore the snapshot.
	// TODO(kolesnikovae): set not-serving service status to not
	//  block incoming requests.
	fsm.mu.Lock()
	defer fsm.mu.Unlock()
	// Wait for the read transactions that were started before the lock
	// was taken.
	fsm.txns.Wait()
	if err = fsm.db.restore(r); err != nil {
		level.Error(fsm.logger).Log("msg", "failed to restore database from snapshot", "err", err)
		return err
	}
	// First we need to initialize the state: each restorer is called
	// synchronously and has exclusive access to the database.
	if err = fsm.init(); err != nil {
		level.Error(fsm.logger).Log("msg", "failed to init state at restore", "err", err)
		return err
	}
	// Then we restore the state: each restorer is given its own
	// transaction and run concurrently with others.
	if err = fsm.restore(); err != nil {
		level.Error(fsm.logger).Log("msg", "failed to restore state from snapshot", "err", err)
		return err
	}
	return nil
}
   206  
// fsmError is an error raised while applying a raft command. It pairs the
// underlying error with the log entry that caused it, so that the entry's
// position can be included in the error message.
type fsmError struct {
	cmd *raft.Log // Log entry being applied; may be nil.
	err error     // Underlying error; may be nil.
}
   211  
   212  func errResponse(cmd *raft.Log, err error) Response {
   213  	return Response{Err: &fsmError{cmd: cmd, err: err}}
   214  }
   215  
   216  func (e *fsmError) Error() string {
   217  	if e.err == nil {
   218  		return ""
   219  	}
   220  	if e.cmd == nil {
   221  		return e.err.Error()
   222  	}
   223  	return fmt.Sprintf("term: %d; index: %d; appended_at: %v; error: %v",
   224  		e.cmd.Index, e.cmd.Term, e.cmd.AppendedAt, e.err)
   225  }
   226  
   227  func (fsm *FSM) Apply(log *raft.Log) any {
   228  	switch log.Type {
   229  	case raft.LogNoop:
   230  	case raft.LogBarrier:
   231  	case raft.LogConfiguration:
   232  	case raft.LogCommand:
   233  		return fsm.applyCommand(log)
   234  	default:
   235  		level.Warn(fsm.logger).Log("msg", "unexpected log entry, ignoring", "type", log.Type.String())
   236  	}
   237  	return nil
   238  }
   239  
   240  // applyCommand receives raw command from the raft log (FSM.Apply),
   241  // and calls the corresponding handler on the _local_ FSM, based on
   242  // the command type.
   243  func (fsm *FSM) applyCommand(cmd *raft.Log) any {
   244  	start := time.Now()
   245  	var e RaftLogEntry
   246  	if err := e.UnmarshalBinary(cmd.Data); err != nil {
   247  		return errResponse(cmd, err)
   248  	}
   249  
   250  	ctx := context.Background()
   251  	if ctxID := string(cmd.Extensions); ctxID != "" {
   252  		var found bool
   253  		if ctx, found = fsm.contextRegistry.Retrieve(ctxID); found {
   254  			defer fsm.contextRegistry.Delete(ctxID)
   255  		}
   256  	}
   257  
   258  	span, ctx := tracing.StartSpanFromContext(ctx, "fsm.applyCommand")
   259  	defer span.Finish()
   260  
   261  	if cmd.Index <= fsm.appliedIndex {
   262  		// Skip already applied commands at WAL restore.
   263  		// Note that the 0 index is a noop and is never applied to FSM.
   264  		return Response{} // todo this may result in nil deref if client does not exepect Response.Data to be nil , for example (svc *CompactionService) PollCompactionJobs
   265  	}
   266  
   267  	cmdType := strconv.FormatUint(uint64(e.Type), 10)
   268  	fsm.db.metrics.fsmApplyCommandSize.WithLabelValues(cmdType).Observe(float64(len(cmd.Data)))
   269  	defer func() {
   270  		fsm.db.metrics.fsmApplyCommandDuration.WithLabelValues(cmdType).Observe(time.Since(start).Seconds())
   271  	}()
   272  
   273  	handle, ok := fsm.handlers[e.Type]
   274  	if !ok {
   275  		return errResponse(cmd, fmt.Errorf("unknown command type: %d", e.Type))
   276  	}
   277  
   278  	// Apply is never called concurrently with Restore, so we don't need
   279  	// to lock the FSM: db.boltdb is guaranteed to be in a consistent state.
   280  	rawTx, err := fsm.db.boltdb.Begin(true)
   281  	if err != nil {
   282  		panic(fmt.Sprint("failed to begin write transaction:", err))
   283  	}
   284  
   285  	txSpan, ctx := opentracing.StartSpanFromContext(ctx, "boltdb.transaction")
   286  	txSpan.SetTag("writable", rawTx.Writable())
   287  	tx := newTracingTx(rawTx, txSpan, ctx)
   288  
   289  	data, err := handle(ctx, tx, cmd, e.Data)
   290  	if err != nil {
   291  		_ = tx.Rollback()
   292  		// NOTE(kolesnikovae): This has to be a hard failure as we assume
   293  		// that the in-memory state might have not been rolled back properly.
   294  		panic(fmt.Sprint("failed to apply command:", err))
   295  	}
   296  
   297  	if err = fsm.storeAppliedIndex(tx.Tx, cmd.Term, cmd.Index); err != nil {
   298  		panic(fmt.Sprint("failed to store applied index: %w", err))
   299  	}
   300  
   301  	// We can't do anything about the failure at the database level, so we
   302  	// panic here in a hope that other instances will handle the command.
   303  	if err = tx.Commit(); err != nil {
   304  		panic(fmt.Sprint("failed to commit transaction:", err))
   305  	}
   306  
   307  	return Response{Data: data, Err: err}
   308  }
   309  
   310  func (fsm *FSM) Read(fn func(*bbolt.Tx)) error {
   311  	fsm.mu.RLock()
   312  	tx, err := fsm.db.boltdb.Begin(false)
   313  	fsm.txns.Add(1)
   314  	fsm.mu.RUnlock()
   315  	if err != nil {
   316  		fsm.txns.Done()
   317  		return fmt.Errorf("failed to begin read transaction: %w", err)
   318  	}
   319  	defer func() {
   320  		_ = tx.Rollback()
   321  		fsm.txns.Done()
   322  	}()
   323  	fn(tx)
   324  	return nil
   325  }
   326  
   327  func (fsm *FSM) Snapshot() (raft.FSMSnapshot, error) {
   328  	// Snapshot should only capture a pointer to the state, and any
   329  	// expensive IO should happen as part of FSMSnapshot.Persist.
   330  	s := snapshotWriter{
   331  		logger:      fsm.logger,
   332  		metrics:     fsm.metrics,
   333  		compression: fsm.config.SnapshotCompression,
   334  		rate:        fsm.config.SnapshotRateLimit,
   335  	}
   336  	tx, err := fsm.db.boltdb.Begin(false)
   337  	if err != nil {
   338  		return nil, fmt.Errorf("failed to open a transaction for snapshot: %w", err)
   339  	}
   340  	s.tx = tx
   341  	return &s, nil
   342  }
   343  
   344  func (fsm *FSM) Shutdown() {
   345  	if fsm.db.boltdb != nil {
   346  		fsm.db.shutdown()
   347  	}
   348  }
   349  
// Location of the FSM's raft bookkeeping in BoltDB: the bucket name and
// the key under which the applied term and index are stored.
var (
	raftBucketName  = []byte("raft")
	appliedIndexKey = []byte("term.applied_index")
	// Value is encoded as [8]term + [8]index:
	// two big-endian uint64 values, 16 bytes in total.
)
   355  
   356  func (fsm *FSM) initRaftBucket(tx *bbolt.Tx) error {
   357  	b := tx.Bucket(raftBucketName)
   358  	if b != nil {
   359  		return nil
   360  	}
   361  	// If no bucket exists, we create a stub with 0 values.
   362  	if _, err := tx.CreateBucket(raftBucketName); err != nil {
   363  		return err
   364  	}
   365  	return fsm.storeAppliedIndex(tx, 0, 0)
   366  }
   367  
   368  func (fsm *FSM) storeAppliedIndex(tx *bbolt.Tx, term, index uint64) error {
   369  	b := tx.Bucket(raftBucketName)
   370  	if b == nil {
   371  		return errors.ErrBucketNotFound
   372  	}
   373  	v := make([]byte, 16)
   374  	binary.BigEndian.PutUint64(v[0:8], term)
   375  	binary.BigEndian.PutUint64(v[8:16], index)
   376  	fsm.appliedTerm = term
   377  	fsm.appliedIndex = index
   378  	return b.Put(appliedIndexKey, v)
   379  }
   380  
// errAppliedIndexInvalid is returned by loadAppliedIndex when the stored
// value is shorter than the 16 bytes needed for the term and the index.
var errAppliedIndexInvalid = fmt.Errorf("invalid applied index")
   382  
   383  func (fsm *FSM) loadAppliedIndex(tx *bbolt.Tx) error {
   384  	b := tx.Bucket(raftBucketName)
   385  	if b == nil {
   386  		return errors.ErrBucketNotFound
   387  	}
   388  	v := b.Get(appliedIndexKey)
   389  	if len(v) < 16 {
   390  		return errAppliedIndexInvalid
   391  	}
   392  	fsm.appliedTerm = binary.BigEndian.Uint64(v[0:8])
   393  	fsm.appliedIndex = binary.BigEndian.Uint64(v[8:16])
   394  	return nil
   395  }