github.com/grafana/pyroscope@v1.18.0/pkg/metastore/fsm/fsm.go (about) 1 package fsm 2 3 import ( 4 "context" 5 "encoding/binary" 6 "flag" 7 "fmt" 8 "io" 9 "strconv" 10 "sync" 11 "time" 12 13 "github.com/go-kit/log" 14 "github.com/go-kit/log/level" 15 "github.com/hashicorp/raft" 16 "github.com/opentracing/opentracing-go" 17 "github.com/prometheus/client_golang/prometheus" 18 "go.etcd.io/bbolt" 19 "go.etcd.io/bbolt/errors" 20 "golang.org/x/sync/errgroup" 21 "google.golang.org/protobuf/proto" 22 23 "github.com/grafana/pyroscope/pkg/metastore/tracing" 24 ) 25 26 type ContextRegistry interface { 27 Retrieve(id string) (context.Context, bool) 28 Delete(id string) 29 Size() int 30 } 31 32 // RaftHandler is a function that processes a Raft command. 33 // The implementation MUST be idempotent. 34 // The context parameter is used for tracing purposes and is only available on the leader. 35 type RaftHandler[Req, Resp proto.Message] func(context.Context, *bbolt.Tx, *raft.Log, Req) (Resp, error) 36 37 // StateRestorer is called during the FSM initialization 38 // to restore the state from a snapshot. 39 // The implementation MUST be idempotent. 40 type StateRestorer interface { 41 // Init is provided with a write transaction to initialize the state. 42 // FSM guarantees that Init is called synchronously and has exclusive 43 // access to the database. 44 Init(*bbolt.Tx) error 45 // Restore is provided with a read transaction to restore the state. 46 // Restore might be called concurrently with other StateRestorer 47 // instances. 48 Restore(*bbolt.Tx) error 49 } 50 51 type Config struct { 52 SnapshotCompression string `yaml:"snapshot_compression"` 53 SnapshotRateLimit int `yaml:"snapshot_rate_limit"` 54 SnapshotCompactOnRestore bool `yaml:"snapshot_compact_on_restore"` 55 // Where the FSM BoltDB data is located. 56 // Does not have to be a persistent volume. 57 DataDir string `yaml:"data_dir"` 58 } 59 60 func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { 61 f.StringVar(&cfg.SnapshotCompression, prefix+"snapshot-compression", "zstd", "Compression algorithm to use for snapshots. Supported compressions: zstd.") 62 f.IntVar(&cfg.SnapshotRateLimit, prefix+"snapshot-rate-limit", 15, "Rate limit for snapshot writer in MB/s.") 63 f.BoolVar(&cfg.SnapshotCompactOnRestore, prefix+"snapshot-compact-on-restore", false, "Compact the database on restore.") 64 f.StringVar(&cfg.DataDir, prefix+"data-dir", "./data-metastore/data", "Directory to store the data.") 65 } 66 67 // FSM implements the raft.FSM interface. 68 type FSM struct { 69 logger log.Logger 70 config Config 71 contextRegistry ContextRegistry 72 metrics *metrics 73 74 mu sync.RWMutex 75 txns sync.WaitGroup 76 db *boltdb 77 78 handlers map[RaftLogEntryType]handler 79 restorers []StateRestorer 80 81 appliedTerm uint64 82 appliedIndex uint64 83 } 84 85 type handler func(ctx context.Context, tx *tracingTx, cmd *raft.Log, raw []byte) (proto.Message, error) 86 87 func New(logger log.Logger, reg prometheus.Registerer, config Config, contextRegistry ContextRegistry) (*FSM, error) { 88 fsm := FSM{ 89 logger: logger, 90 config: config, 91 contextRegistry: contextRegistry, 92 metrics: newMetrics(reg), 93 handlers: make(map[RaftLogEntryType]handler), 94 } 95 db := newDB(logger, fsm.metrics, config) 96 if err := db.open(false); err != nil { 97 return nil, err 98 } 99 fsm.db = db 100 return &fsm, nil 101 } 102 103 func (fsm *FSM) RegisterRestorer(r ...StateRestorer) { 104 fsm.restorers = append(fsm.restorers, r...) 105 } 106 107 func RegisterRaftCommandHandler[Req, Resp proto.Message](fsm *FSM, t RaftLogEntryType, handler RaftHandler[Req, Resp]) { 108 fsm.handlers[t] = func(ctx context.Context, tx *tracingTx, cmd *raft.Log, raw []byte) (proto.Message, error) { 109 req, err := unmarshal[Req](raw) 110 if err != nil { 111 return nil, err 112 } 113 return handler(ctx, tx.Tx, cmd, req) 114 } 115 } 116 117 // Init must be called after the FSM is created and all restorers are registered. 118 func (fsm *FSM) Init() error { 119 if err := fsm.init(); err != nil { 120 return fmt.Errorf("failed to initialize state: %w", err) 121 } 122 if err := fsm.restore(); err != nil { 123 return fmt.Errorf("failed to restore state: %w", err) 124 } 125 return nil 126 } 127 128 func (fsm *FSM) init() (err error) { 129 tx, err := fsm.db.boltdb.Begin(true) 130 if err != nil { 131 return err 132 } 133 defer func() { 134 if err == nil { 135 err = tx.Commit() 136 } else { 137 _ = tx.Rollback() 138 } 139 }() 140 if err = fsm.initRaftBucket(tx); err != nil { 141 return fmt.Errorf("failed to init raft bucket: %w", err) 142 } 143 for _, r := range fsm.restorers { 144 if err = r.Init(tx); err != nil { 145 return err 146 } 147 } 148 return nil 149 } 150 151 func (fsm *FSM) restore() error { 152 if err := fsm.db.boltdb.View(fsm.loadAppliedIndex); err != nil { 153 return fmt.Errorf("failed to load applied index: %w", err) 154 } 155 level.Info(fsm.logger).Log("msg", "restoring state", "term", fsm.appliedTerm, "applied_index", fsm.appliedIndex) 156 g, _ := errgroup.WithContext(context.Background()) 157 for _, r := range fsm.restorers { 158 g.Go(func() error { 159 return fsm.db.boltdb.View(r.Restore) 160 }) 161 } 162 return g.Wait() 163 } 164 165 // Restore restores the FSM state from a snapshot. 166 func (fsm *FSM) Restore(snapshot io.ReadCloser) (err error) { 167 start := time.Now() 168 level.Info(fsm.logger).Log("msg", "restoring snapshot") 169 defer func() { 170 _ = snapshot.Close() 171 fsm.db.metrics.fsmRestoreSnapshotDuration.Observe(time.Since(start).Seconds()) 172 }() 173 174 var r *snapshotReader 175 if r, err = newSnapshotReader(snapshot); err != nil { 176 level.Error(fsm.logger).Log("msg", "failed to create snapshot reader", "err", err) 177 return err 178 } 179 // The wrapper never returns errors on Close. 180 defer r.Close() 181 182 // Block all new transactions until we restore the snapshot. 183 // TODO(kolesnikovae): set not-serving service status to not 184 // block incoming requests. 185 fsm.mu.Lock() 186 defer fsm.mu.Unlock() 187 fsm.txns.Wait() 188 if err = fsm.db.restore(r); err != nil { 189 level.Error(fsm.logger).Log("msg", "failed to restore database from snapshot", "err", err) 190 return err 191 } 192 // First we need to initialize the state: each restorer is called 193 // synchronously and has exclusive access to the database. 194 if err = fsm.init(); err != nil { 195 level.Error(fsm.logger).Log("msg", "failed to init state at restore", "err", err) 196 return err 197 } 198 // Then we restore the state: each restorer is given its own 199 // transaction and run concurrently with others. 200 if err = fsm.restore(); err != nil { 201 level.Error(fsm.logger).Log("msg", "failed to restore state from snapshot", "err", err) 202 return err 203 } 204 return nil 205 } 206 207 type fsmError struct { 208 cmd *raft.Log 209 err error 210 } 211 212 func errResponse(cmd *raft.Log, err error) Response { 213 return Response{Err: &fsmError{cmd: cmd, err: err}} 214 } 215 216 func (e *fsmError) Error() string { 217 if e.err == nil { 218 return "" 219 } 220 if e.cmd == nil { 221 return e.err.Error() 222 } 223 return fmt.Sprintf("term: %d; index: %d; appended_at: %v; error: %v", 224 e.cmd.Index, e.cmd.Term, e.cmd.AppendedAt, e.err) 225 } 226 227 func (fsm *FSM) Apply(log *raft.Log) any { 228 switch log.Type { 229 case raft.LogNoop: 230 case raft.LogBarrier: 231 case raft.LogConfiguration: 232 case raft.LogCommand: 233 return fsm.applyCommand(log) 234 default: 235 level.Warn(fsm.logger).Log("msg", "unexpected log entry, ignoring", "type", log.Type.String()) 236 } 237 return nil 238 } 239 240 // applyCommand receives raw command from the raft log (FSM.Apply), 241 // and calls the corresponding handler on the _local_ FSM, based on 242 // the command type. 243 func (fsm *FSM) applyCommand(cmd *raft.Log) any { 244 start := time.Now() 245 var e RaftLogEntry 246 if err := e.UnmarshalBinary(cmd.Data); err != nil { 247 return errResponse(cmd, err) 248 } 249 250 ctx := context.Background() 251 if ctxID := string(cmd.Extensions); ctxID != "" { 252 var found bool 253 if ctx, found = fsm.contextRegistry.Retrieve(ctxID); found { 254 defer fsm.contextRegistry.Delete(ctxID) 255 } 256 } 257 258 span, ctx := tracing.StartSpanFromContext(ctx, "fsm.applyCommand") 259 defer span.Finish() 260 261 if cmd.Index <= fsm.appliedIndex { 262 // Skip already applied commands at WAL restore. 263 // Note that the 0 index is a noop and is never applied to FSM. 264 return Response{} // todo this may result in nil deref if client does not exepect Response.Data to be nil , for example (svc *CompactionService) PollCompactionJobs 265 } 266 267 cmdType := strconv.FormatUint(uint64(e.Type), 10) 268 fsm.db.metrics.fsmApplyCommandSize.WithLabelValues(cmdType).Observe(float64(len(cmd.Data))) 269 defer func() { 270 fsm.db.metrics.fsmApplyCommandDuration.WithLabelValues(cmdType).Observe(time.Since(start).Seconds()) 271 }() 272 273 handle, ok := fsm.handlers[e.Type] 274 if !ok { 275 return errResponse(cmd, fmt.Errorf("unknown command type: %d", e.Type)) 276 } 277 278 // Apply is never called concurrently with Restore, so we don't need 279 // to lock the FSM: db.boltdb is guaranteed to be in a consistent state. 280 rawTx, err := fsm.db.boltdb.Begin(true) 281 if err != nil { 282 panic(fmt.Sprint("failed to begin write transaction:", err)) 283 } 284 285 txSpan, ctx := opentracing.StartSpanFromContext(ctx, "boltdb.transaction") 286 txSpan.SetTag("writable", rawTx.Writable()) 287 tx := newTracingTx(rawTx, txSpan, ctx) 288 289 data, err := handle(ctx, tx, cmd, e.Data) 290 if err != nil { 291 _ = tx.Rollback() 292 // NOTE(kolesnikovae): This has to be a hard failure as we assume 293 // that the in-memory state might have not been rolled back properly. 294 panic(fmt.Sprint("failed to apply command:", err)) 295 } 296 297 if err = fsm.storeAppliedIndex(tx.Tx, cmd.Term, cmd.Index); err != nil { 298 panic(fmt.Sprint("failed to store applied index: %w", err)) 299 } 300 301 // We can't do anything about the failure at the database level, so we 302 // panic here in a hope that other instances will handle the command. 303 if err = tx.Commit(); err != nil { 304 panic(fmt.Sprint("failed to commit transaction:", err)) 305 } 306 307 return Response{Data: data, Err: err} 308 } 309 310 func (fsm *FSM) Read(fn func(*bbolt.Tx)) error { 311 fsm.mu.RLock() 312 tx, err := fsm.db.boltdb.Begin(false) 313 fsm.txns.Add(1) 314 fsm.mu.RUnlock() 315 if err != nil { 316 fsm.txns.Done() 317 return fmt.Errorf("failed to begin read transaction: %w", err) 318 } 319 defer func() { 320 _ = tx.Rollback() 321 fsm.txns.Done() 322 }() 323 fn(tx) 324 return nil 325 } 326 327 func (fsm *FSM) Snapshot() (raft.FSMSnapshot, error) { 328 // Snapshot should only capture a pointer to the state, and any 329 // expensive IO should happen as part of FSMSnapshot.Persist. 330 s := snapshotWriter{ 331 logger: fsm.logger, 332 metrics: fsm.metrics, 333 compression: fsm.config.SnapshotCompression, 334 rate: fsm.config.SnapshotRateLimit, 335 } 336 tx, err := fsm.db.boltdb.Begin(false) 337 if err != nil { 338 return nil, fmt.Errorf("failed to open a transaction for snapshot: %w", err) 339 } 340 s.tx = tx 341 return &s, nil 342 } 343 344 func (fsm *FSM) Shutdown() { 345 if fsm.db.boltdb != nil { 346 fsm.db.shutdown() 347 } 348 } 349 350 var ( 351 raftBucketName = []byte("raft") 352 appliedIndexKey = []byte("term.applied_index") 353 // Value is encoded as [8]term + [8]index. 354 ) 355 356 func (fsm *FSM) initRaftBucket(tx *bbolt.Tx) error { 357 b := tx.Bucket(raftBucketName) 358 if b != nil { 359 return nil 360 } 361 // If no bucket exists, we create a stub with 0 values. 362 if _, err := tx.CreateBucket(raftBucketName); err != nil { 363 return err 364 } 365 return fsm.storeAppliedIndex(tx, 0, 0) 366 } 367 368 func (fsm *FSM) storeAppliedIndex(tx *bbolt.Tx, term, index uint64) error { 369 b := tx.Bucket(raftBucketName) 370 if b == nil { 371 return errors.ErrBucketNotFound 372 } 373 v := make([]byte, 16) 374 binary.BigEndian.PutUint64(v[0:8], term) 375 binary.BigEndian.PutUint64(v[8:16], index) 376 fsm.appliedTerm = term 377 fsm.appliedIndex = index 378 return b.Put(appliedIndexKey, v) 379 } 380 381 var errAppliedIndexInvalid = fmt.Errorf("invalid applied index") 382 383 func (fsm *FSM) loadAppliedIndex(tx *bbolt.Tx) error { 384 b := tx.Bucket(raftBucketName) 385 if b == nil { 386 return errors.ErrBucketNotFound 387 } 388 v := b.Get(appliedIndexKey) 389 if len(v) < 16 { 390 return errAppliedIndexInvalid 391 } 392 fsm.appliedTerm = binary.BigEndian.Uint64(v[0:8]) 393 fsm.appliedIndex = binary.BigEndian.Uint64(v[8:16]) 394 return nil 395 }