github.com/MetalBlockchain/metalgo@v1.11.9/snow/engine/avalanche/bootstrap/queue/jobs.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package queue
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"time"
    11  
    12  	"github.com/prometheus/client_golang/prometheus"
    13  	"go.uber.org/zap"
    14  
    15  	"github.com/MetalBlockchain/metalgo/database"
    16  	"github.com/MetalBlockchain/metalgo/database/versiondb"
    17  	"github.com/MetalBlockchain/metalgo/ids"
    18  	"github.com/MetalBlockchain/metalgo/snow"
    19  	"github.com/MetalBlockchain/metalgo/snow/engine/common"
    20  	"github.com/MetalBlockchain/metalgo/utils/set"
    21  	"github.com/MetalBlockchain/metalgo/utils/timer"
    22  )
    23  
    24  const progressUpdateFrequency = 30 * time.Second
    25  
    26  // Jobs tracks a series of jobs that form a DAG of dependencies.
    27  type Jobs struct {
    28  	// db ensures that database updates are atomically updated.
    29  	db *versiondb.Database
    30  	// state writes the job queue to [db].
    31  	state *state
    32  	// Measures the ETA until bootstrapping finishes in nanoseconds.
    33  	etaMetric prometheus.Gauge
    34  }
    35  
    36  // New attempts to create a new job queue from the provided database.
    37  func New(
    38  	db database.Database,
    39  	metricsNamespace string,
    40  	metricsRegisterer prometheus.Registerer,
    41  ) (*Jobs, error) {
    42  	vdb := versiondb.New(db)
    43  	state, err := newState(vdb, metricsNamespace, metricsRegisterer)
    44  	if err != nil {
    45  		return nil, fmt.Errorf("couldn't create new jobs state: %w", err)
    46  	}
    47  
    48  	etaMetric := prometheus.NewGauge(prometheus.GaugeOpts{
    49  		Namespace: metricsNamespace,
    50  		Name:      "eta_execution_complete",
    51  		Help:      "ETA in nanoseconds until execution phase of bootstrapping finishes",
    52  	})
    53  
    54  	return &Jobs{
    55  		db:        vdb,
    56  		state:     state,
    57  		etaMetric: etaMetric,
    58  	}, metricsRegisterer.Register(etaMetric)
    59  }
    60  
    61  // SetParser tells this job queue how to parse jobs from the database.
    62  func (j *Jobs) SetParser(parser Parser) error {
    63  	j.state.parser = parser
    64  	return nil
    65  }
    66  
    67  func (j *Jobs) Has(jobID ids.ID) (bool, error) {
    68  	return j.state.HasJob(jobID)
    69  }
    70  
    71  // Returns how many pending jobs are waiting in the queue.
    72  func (j *Jobs) PendingJobs() uint64 {
    73  	return j.state.numJobs
    74  }
    75  
    76  // Push adds a new job to the queue. Returns true if [job] was added to the queue and false
    77  // if [job] was already in the queue.
    78  func (j *Jobs) Push(ctx context.Context, job Job) (bool, error) {
    79  	jobID := job.ID()
    80  	if has, err := j.state.HasJob(jobID); err != nil {
    81  		return false, fmt.Errorf("failed to check for existing job %s due to %w", jobID, err)
    82  	} else if has {
    83  		return false, nil
    84  	}
    85  
    86  	deps, err := job.MissingDependencies(ctx)
    87  	if err != nil {
    88  		return false, err
    89  	}
    90  	// Store this job into the database.
    91  	if err := j.state.PutJob(job); err != nil {
    92  		return false, fmt.Errorf("failed to write job due to %w", err)
    93  	}
    94  
    95  	if deps.Len() != 0 {
    96  		// This job needs to block on a set of dependencies.
    97  		for depID := range deps {
    98  			if err := j.state.AddDependency(depID, jobID); err != nil {
    99  				return false, fmt.Errorf("failed to add blocking for depID %s, jobID %s", depID, jobID)
   100  			}
   101  		}
   102  		return true, nil
   103  	}
   104  	// This job doesn't have any dependencies, so it should be placed onto the
   105  	// executable stack.
   106  	if err := j.state.AddRunnableJob(jobID); err != nil {
   107  		return false, fmt.Errorf("failed to add %s as a runnable job due to %w", jobID, err)
   108  	}
   109  	return true, nil
   110  }
   111  
   112  func (j *Jobs) ExecuteAll(
   113  	ctx context.Context,
   114  	chainCtx *snow.ConsensusContext,
   115  	halter common.Haltable,
   116  	restarted bool,
   117  	acceptors ...snow.Acceptor,
   118  ) (int, error) {
   119  	chainCtx.Executing.Set(true)
   120  	defer chainCtx.Executing.Set(false)
   121  
   122  	numExecuted := 0
   123  	numToExecute := j.state.numJobs
   124  	startTime := time.Now()
   125  	lastProgressUpdate := startTime
   126  
   127  	// Disable and clear state caches to prevent us from attempting to execute
   128  	// a vertex that was previously parsed, but not saved to the VM. Some VMs
   129  	// may only persist containers when they are accepted. This is a stop-gap
   130  	// measure to ensure the job will be re-parsed before executing until the VM
   131  	// provides a more explicit interface for freeing parsed blocks.
   132  	// TODO remove DisableCaching when VM provides better interface for freeing
   133  	// blocks.
   134  	j.state.DisableCaching()
   135  	for {
   136  		if halter.Halted() {
   137  			chainCtx.Log.Info("interrupted execution",
   138  				zap.Int("numExecuted", numExecuted),
   139  			)
   140  			return numExecuted, nil
   141  		}
   142  
   143  		job, err := j.state.RemoveRunnableJob(ctx)
   144  		if err == database.ErrNotFound {
   145  			break
   146  		}
   147  		if err != nil {
   148  			return 0, fmt.Errorf("failed to removing runnable job with %w", err)
   149  		}
   150  
   151  		jobID := job.ID()
   152  		chainCtx.Log.Debug("executing",
   153  			zap.Stringer("jobID", jobID),
   154  		)
   155  		jobBytes := job.Bytes()
   156  		// Note that acceptor.Accept must be called before executing [job] to
   157  		// honor Acceptor.Accept's invariant.
   158  		for _, acceptor := range acceptors {
   159  			if err := acceptor.Accept(chainCtx, jobID, jobBytes); err != nil {
   160  				return numExecuted, err
   161  			}
   162  		}
   163  		if err := job.Execute(ctx); err != nil {
   164  			return 0, fmt.Errorf("failed to execute job %s due to %w", jobID, err)
   165  		}
   166  
   167  		dependentIDs, err := j.state.RemoveDependencies(jobID)
   168  		if err != nil {
   169  			return 0, fmt.Errorf("failed to remove blocking jobs for %s due to %w", jobID, err)
   170  		}
   171  
   172  		for _, dependentID := range dependentIDs {
   173  			job, err := j.state.GetJob(ctx, dependentID)
   174  			if err != nil {
   175  				return 0, fmt.Errorf("failed to get job %s from blocking jobs due to %w", dependentID, err)
   176  			}
   177  			hasMissingDeps, err := job.HasMissingDependencies(ctx)
   178  			if err != nil {
   179  				return 0, fmt.Errorf("failed to get missing dependencies for %s due to %w", dependentID, err)
   180  			}
   181  			if hasMissingDeps {
   182  				continue
   183  			}
   184  			if err := j.state.AddRunnableJob(dependentID); err != nil {
   185  				return 0, fmt.Errorf("failed to add %s as a runnable job due to %w", dependentID, err)
   186  			}
   187  		}
   188  		if err := j.Commit(); err != nil {
   189  			return 0, err
   190  		}
   191  
   192  		numExecuted++
   193  		if time.Since(lastProgressUpdate) > progressUpdateFrequency { // Periodically print progress
   194  			eta := timer.EstimateETA(
   195  				startTime,
   196  				uint64(numExecuted),
   197  				numToExecute,
   198  			)
   199  			j.etaMetric.Set(float64(eta))
   200  
   201  			if !restarted {
   202  				chainCtx.Log.Info("executing operations",
   203  					zap.Int("numExecuted", numExecuted),
   204  					zap.Uint64("numToExecute", numToExecute),
   205  					zap.Duration("eta", eta),
   206  				)
   207  			} else {
   208  				chainCtx.Log.Debug("executing operations",
   209  					zap.Int("numExecuted", numExecuted),
   210  					zap.Uint64("numToExecute", numToExecute),
   211  					zap.Duration("eta", eta),
   212  				)
   213  			}
   214  
   215  			lastProgressUpdate = time.Now()
   216  		}
   217  	}
   218  
   219  	// Now that executing has finished, zero out the ETA.
   220  	j.etaMetric.Set(0)
   221  
   222  	if !restarted {
   223  		chainCtx.Log.Info("executed operations",
   224  			zap.Int("numExecuted", numExecuted),
   225  		)
   226  	} else {
   227  		chainCtx.Log.Debug("executed operations",
   228  			zap.Int("numExecuted", numExecuted),
   229  		)
   230  	}
   231  	return numExecuted, nil
   232  }
   233  
   234  func (j *Jobs) Clear() error {
   235  	return j.state.Clear()
   236  }
   237  
   238  // Commit the versionDB to the underlying database.
   239  func (j *Jobs) Commit() error {
   240  	return j.db.Commit()
   241  }
   242  
   243  type JobsWithMissing struct {
   244  	*Jobs
   245  
   246  	// keep the missing ID set in memory to avoid unnecessary database reads and
   247  	// writes.
   248  	missingIDs                            set.Set[ids.ID]
   249  	removeFromMissingIDs, addToMissingIDs set.Set[ids.ID]
   250  }
   251  
   252  func NewWithMissing(
   253  	db database.Database,
   254  	metricsNamespace string,
   255  	metricsRegisterer prometheus.Registerer,
   256  ) (*JobsWithMissing, error) {
   257  	innerJobs, err := New(db, metricsNamespace, metricsRegisterer)
   258  	if err != nil {
   259  		return nil, err
   260  	}
   261  
   262  	jobs := &JobsWithMissing{
   263  		Jobs: innerJobs,
   264  	}
   265  
   266  	missingIDs, err := jobs.state.MissingJobIDs()
   267  	jobs.missingIDs.Add(missingIDs...)
   268  	return jobs, err
   269  }
   270  
   271  // SetParser tells this job queue how to parse jobs from the database.
   272  func (jm *JobsWithMissing) SetParser(ctx context.Context, parser Parser) error {
   273  	jm.state.parser = parser
   274  	return jm.cleanRunnableStack(ctx)
   275  }
   276  
   277  func (jm *JobsWithMissing) Clear() error {
   278  	if err := jm.state.RemoveMissingJobIDs(jm.missingIDs); err != nil {
   279  		return err
   280  	}
   281  
   282  	jm.missingIDs.Clear()
   283  	jm.addToMissingIDs.Clear()
   284  	jm.removeFromMissingIDs.Clear()
   285  
   286  	return jm.Jobs.Clear()
   287  }
   288  
   289  func (jm *JobsWithMissing) Has(jobID ids.ID) (bool, error) {
   290  	if jm.missingIDs.Contains(jobID) {
   291  		return false, nil
   292  	}
   293  
   294  	return jm.Jobs.Has(jobID)
   295  }
   296  
   297  // Push adds a new job to the queue. Returns true if [job] was added to the queue and false
   298  // if [job] was already in the queue.
   299  func (jm *JobsWithMissing) Push(ctx context.Context, job Job) (bool, error) {
   300  	jobID := job.ID()
   301  	if has, err := jm.Has(jobID); err != nil {
   302  		return false, fmt.Errorf("failed to check for existing job %s due to %w", jobID, err)
   303  	} else if has {
   304  		return false, nil
   305  	}
   306  
   307  	deps, err := job.MissingDependencies(ctx)
   308  	if err != nil {
   309  		return false, err
   310  	}
   311  	// Store this job into the database.
   312  	if err := jm.state.PutJob(job); err != nil {
   313  		return false, fmt.Errorf("failed to write job due to %w", err)
   314  	}
   315  
   316  	if deps.Len() != 0 {
   317  		// This job needs to block on a set of dependencies.
   318  		for depID := range deps {
   319  			if err := jm.state.AddDependency(depID, jobID); err != nil {
   320  				return false, fmt.Errorf("failed to add blocking for depID %s, jobID %s", depID, jobID)
   321  			}
   322  		}
   323  		return true, nil
   324  	}
   325  	// This job doesn't have any dependencies, so it should be placed onto the
   326  	// executable stack.
   327  	if err := jm.state.AddRunnableJob(jobID); err != nil {
   328  		return false, fmt.Errorf("failed to add %s as a runnable job due to %w", jobID, err)
   329  	}
   330  	return true, nil
   331  }
   332  
   333  // AddMissingID adds [jobID] to missingIDs
   334  func (jm *JobsWithMissing) AddMissingID(jobIDs ...ids.ID) {
   335  	for _, jobID := range jobIDs {
   336  		if !jm.missingIDs.Contains(jobID) {
   337  			jm.missingIDs.Add(jobID)
   338  			jm.addToMissingIDs.Add(jobID)
   339  			jm.removeFromMissingIDs.Remove(jobID)
   340  		}
   341  	}
   342  }
   343  
   344  // RemoveMissingID removes [jobID] from missingIDs
   345  func (jm *JobsWithMissing) RemoveMissingID(jobIDs ...ids.ID) {
   346  	for _, jobID := range jobIDs {
   347  		if jm.missingIDs.Contains(jobID) {
   348  			jm.missingIDs.Remove(jobID)
   349  			jm.addToMissingIDs.Remove(jobID)
   350  			jm.removeFromMissingIDs.Add(jobID)
   351  		}
   352  	}
   353  }
   354  
   355  func (jm *JobsWithMissing) MissingIDs() []ids.ID {
   356  	return jm.missingIDs.List()
   357  }
   358  
   359  func (jm *JobsWithMissing) NumMissingIDs() int {
   360  	return jm.missingIDs.Len()
   361  }
   362  
   363  // Commit the versionDB to the underlying database.
   364  func (jm *JobsWithMissing) Commit() error {
   365  	if jm.addToMissingIDs.Len() != 0 {
   366  		if err := jm.state.AddMissingJobIDs(jm.addToMissingIDs); err != nil {
   367  			return err
   368  		}
   369  		jm.addToMissingIDs.Clear()
   370  	}
   371  	if jm.removeFromMissingIDs.Len() != 0 {
   372  		if err := jm.state.RemoveMissingJobIDs(jm.removeFromMissingIDs); err != nil {
   373  			return err
   374  		}
   375  		jm.removeFromMissingIDs.Clear()
   376  	}
   377  	return jm.Jobs.Commit()
   378  }
   379  
   380  // cleanRunnableStack iterates over the jobs on the runnable stack and resets any job
   381  // that has missing dependencies to block on those dependencies.
   382  // Note: the jobs queue ensures that no job with missing dependencies will be placed
   383  // on the runnable stack in the first place.
   384  // However, for specific VM implementations blocks may be committed via a two stage commit
   385  // (ex. platformvm Proposal and Commit/Abort blocks). This can cause an issue where if the first stage
   386  // is executed immediately before the node dies, it will be removed from the runnable stack
   387  // without writing the state transition to the VM's database. When the node restarts, the
   388  // VM will not have marked the first block (the proposal block as accepted), but it could
   389  // have already been removed from the jobs queue. cleanRunnableStack handles this case.
   390  func (jm *JobsWithMissing) cleanRunnableStack(ctx context.Context) error {
   391  	runnableJobsIter := jm.state.runnableJobIDs.NewIterator()
   392  	defer runnableJobsIter.Release()
   393  
   394  	for runnableJobsIter.Next() {
   395  		jobIDBytes := runnableJobsIter.Key()
   396  		jobID, err := ids.ToID(jobIDBytes)
   397  		if err != nil {
   398  			return fmt.Errorf("failed to convert jobID bytes into ID due to: %w", err)
   399  		}
   400  
   401  		job, err := jm.state.GetJob(ctx, jobID)
   402  		if err != nil {
   403  			return fmt.Errorf("failed to retrieve job on runnnable stack due to: %w", err)
   404  		}
   405  		deps, err := job.MissingDependencies(ctx)
   406  		if err != nil {
   407  			return fmt.Errorf("failed to retrieve missing dependencies of job on runnable stack due to: %w", err)
   408  		}
   409  		if deps.Len() == 0 {
   410  			continue
   411  		}
   412  
   413  		// If the job has missing dependencies, remove it from the runnable stack
   414  		if err := jm.state.runnableJobIDs.Delete(jobIDBytes); err != nil {
   415  			return fmt.Errorf("failed to delete jobID from runnable stack due to: %w", err)
   416  		}
   417  
   418  		// Add the missing dependencies to the set that needs to be fetched.
   419  		jm.AddMissingID(deps.List()...)
   420  		for depID := range deps {
   421  			if err := jm.state.AddDependency(depID, jobID); err != nil {
   422  				return fmt.Errorf("failed to add blocking for depID %s, jobID %s while cleaning the runnable stack", depID, jobID)
   423  			}
   424  		}
   425  	}
   426  
   427  	return errors.Join(
   428  		runnableJobsIter.Error(),
   429  		jm.Commit(),
   430  	)
   431  }