github.com/MetalBlockchain/metalgo@v1.11.9/snow/engine/avalanche/bootstrap/queue/jobs.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package queue 5 6 import ( 7 "context" 8 "errors" 9 "fmt" 10 "time" 11 12 "github.com/prometheus/client_golang/prometheus" 13 "go.uber.org/zap" 14 15 "github.com/MetalBlockchain/metalgo/database" 16 "github.com/MetalBlockchain/metalgo/database/versiondb" 17 "github.com/MetalBlockchain/metalgo/ids" 18 "github.com/MetalBlockchain/metalgo/snow" 19 "github.com/MetalBlockchain/metalgo/snow/engine/common" 20 "github.com/MetalBlockchain/metalgo/utils/set" 21 "github.com/MetalBlockchain/metalgo/utils/timer" 22 ) 23 24 const progressUpdateFrequency = 30 * time.Second 25 26 // Jobs tracks a series of jobs that form a DAG of dependencies. 27 type Jobs struct { 28 // db ensures that database updates are atomically updated. 29 db *versiondb.Database 30 // state writes the job queue to [db]. 31 state *state 32 // Measures the ETA until bootstrapping finishes in nanoseconds. 33 etaMetric prometheus.Gauge 34 } 35 36 // New attempts to create a new job queue from the provided database. 37 func New( 38 db database.Database, 39 metricsNamespace string, 40 metricsRegisterer prometheus.Registerer, 41 ) (*Jobs, error) { 42 vdb := versiondb.New(db) 43 state, err := newState(vdb, metricsNamespace, metricsRegisterer) 44 if err != nil { 45 return nil, fmt.Errorf("couldn't create new jobs state: %w", err) 46 } 47 48 etaMetric := prometheus.NewGauge(prometheus.GaugeOpts{ 49 Namespace: metricsNamespace, 50 Name: "eta_execution_complete", 51 Help: "ETA in nanoseconds until execution phase of bootstrapping finishes", 52 }) 53 54 return &Jobs{ 55 db: vdb, 56 state: state, 57 etaMetric: etaMetric, 58 }, metricsRegisterer.Register(etaMetric) 59 } 60 61 // SetParser tells this job queue how to parse jobs from the database. 62 func (j *Jobs) SetParser(parser Parser) error { 63 j.state.parser = parser 64 return nil 65 } 66 67 func (j *Jobs) Has(jobID ids.ID) (bool, error) { 68 return j.state.HasJob(jobID) 69 } 70 71 // Returns how many pending jobs are waiting in the queue. 72 func (j *Jobs) PendingJobs() uint64 { 73 return j.state.numJobs 74 } 75 76 // Push adds a new job to the queue. Returns true if [job] was added to the queue and false 77 // if [job] was already in the queue. 78 func (j *Jobs) Push(ctx context.Context, job Job) (bool, error) { 79 jobID := job.ID() 80 if has, err := j.state.HasJob(jobID); err != nil { 81 return false, fmt.Errorf("failed to check for existing job %s due to %w", jobID, err) 82 } else if has { 83 return false, nil 84 } 85 86 deps, err := job.MissingDependencies(ctx) 87 if err != nil { 88 return false, err 89 } 90 // Store this job into the database. 91 if err := j.state.PutJob(job); err != nil { 92 return false, fmt.Errorf("failed to write job due to %w", err) 93 } 94 95 if deps.Len() != 0 { 96 // This job needs to block on a set of dependencies. 97 for depID := range deps { 98 if err := j.state.AddDependency(depID, jobID); err != nil { 99 return false, fmt.Errorf("failed to add blocking for depID %s, jobID %s", depID, jobID) 100 } 101 } 102 return true, nil 103 } 104 // This job doesn't have any dependencies, so it should be placed onto the 105 // executable stack. 106 if err := j.state.AddRunnableJob(jobID); err != nil { 107 return false, fmt.Errorf("failed to add %s as a runnable job due to %w", jobID, err) 108 } 109 return true, nil 110 } 111 112 func (j *Jobs) ExecuteAll( 113 ctx context.Context, 114 chainCtx *snow.ConsensusContext, 115 halter common.Haltable, 116 restarted bool, 117 acceptors ...snow.Acceptor, 118 ) (int, error) { 119 chainCtx.Executing.Set(true) 120 defer chainCtx.Executing.Set(false) 121 122 numExecuted := 0 123 numToExecute := j.state.numJobs 124 startTime := time.Now() 125 lastProgressUpdate := startTime 126 127 // Disable and clear state caches to prevent us from attempting to execute 128 // a vertex that was previously parsed, but not saved to the VM. Some VMs 129 // may only persist containers when they are accepted. This is a stop-gap 130 // measure to ensure the job will be re-parsed before executing until the VM 131 // provides a more explicit interface for freeing parsed blocks. 132 // TODO remove DisableCaching when VM provides better interface for freeing 133 // blocks. 134 j.state.DisableCaching() 135 for { 136 if halter.Halted() { 137 chainCtx.Log.Info("interrupted execution", 138 zap.Int("numExecuted", numExecuted), 139 ) 140 return numExecuted, nil 141 } 142 143 job, err := j.state.RemoveRunnableJob(ctx) 144 if err == database.ErrNotFound { 145 break 146 } 147 if err != nil { 148 return 0, fmt.Errorf("failed to removing runnable job with %w", err) 149 } 150 151 jobID := job.ID() 152 chainCtx.Log.Debug("executing", 153 zap.Stringer("jobID", jobID), 154 ) 155 jobBytes := job.Bytes() 156 // Note that acceptor.Accept must be called before executing [job] to 157 // honor Acceptor.Accept's invariant. 158 for _, acceptor := range acceptors { 159 if err := acceptor.Accept(chainCtx, jobID, jobBytes); err != nil { 160 return numExecuted, err 161 } 162 } 163 if err := job.Execute(ctx); err != nil { 164 return 0, fmt.Errorf("failed to execute job %s due to %w", jobID, err) 165 } 166 167 dependentIDs, err := j.state.RemoveDependencies(jobID) 168 if err != nil { 169 return 0, fmt.Errorf("failed to remove blocking jobs for %s due to %w", jobID, err) 170 } 171 172 for _, dependentID := range dependentIDs { 173 job, err := j.state.GetJob(ctx, dependentID) 174 if err != nil { 175 return 0, fmt.Errorf("failed to get job %s from blocking jobs due to %w", dependentID, err) 176 } 177 hasMissingDeps, err := job.HasMissingDependencies(ctx) 178 if err != nil { 179 return 0, fmt.Errorf("failed to get missing dependencies for %s due to %w", dependentID, err) 180 } 181 if hasMissingDeps { 182 continue 183 } 184 if err := j.state.AddRunnableJob(dependentID); err != nil { 185 return 0, fmt.Errorf("failed to add %s as a runnable job due to %w", dependentID, err) 186 } 187 } 188 if err := j.Commit(); err != nil { 189 return 0, err 190 } 191 192 numExecuted++ 193 if time.Since(lastProgressUpdate) > progressUpdateFrequency { // Periodically print progress 194 eta := timer.EstimateETA( 195 startTime, 196 uint64(numExecuted), 197 numToExecute, 198 ) 199 j.etaMetric.Set(float64(eta)) 200 201 if !restarted { 202 chainCtx.Log.Info("executing operations", 203 zap.Int("numExecuted", numExecuted), 204 zap.Uint64("numToExecute", numToExecute), 205 zap.Duration("eta", eta), 206 ) 207 } else { 208 chainCtx.Log.Debug("executing operations", 209 zap.Int("numExecuted", numExecuted), 210 zap.Uint64("numToExecute", numToExecute), 211 zap.Duration("eta", eta), 212 ) 213 } 214 215 lastProgressUpdate = time.Now() 216 } 217 } 218 219 // Now that executing has finished, zero out the ETA. 220 j.etaMetric.Set(0) 221 222 if !restarted { 223 chainCtx.Log.Info("executed operations", 224 zap.Int("numExecuted", numExecuted), 225 ) 226 } else { 227 chainCtx.Log.Debug("executed operations", 228 zap.Int("numExecuted", numExecuted), 229 ) 230 } 231 return numExecuted, nil 232 } 233 234 func (j *Jobs) Clear() error { 235 return j.state.Clear() 236 } 237 238 // Commit the versionDB to the underlying database. 239 func (j *Jobs) Commit() error { 240 return j.db.Commit() 241 } 242 243 type JobsWithMissing struct { 244 *Jobs 245 246 // keep the missing ID set in memory to avoid unnecessary database reads and 247 // writes. 248 missingIDs set.Set[ids.ID] 249 removeFromMissingIDs, addToMissingIDs set.Set[ids.ID] 250 } 251 252 func NewWithMissing( 253 db database.Database, 254 metricsNamespace string, 255 metricsRegisterer prometheus.Registerer, 256 ) (*JobsWithMissing, error) { 257 innerJobs, err := New(db, metricsNamespace, metricsRegisterer) 258 if err != nil { 259 return nil, err 260 } 261 262 jobs := &JobsWithMissing{ 263 Jobs: innerJobs, 264 } 265 266 missingIDs, err := jobs.state.MissingJobIDs() 267 jobs.missingIDs.Add(missingIDs...) 268 return jobs, err 269 } 270 271 // SetParser tells this job queue how to parse jobs from the database. 272 func (jm *JobsWithMissing) SetParser(ctx context.Context, parser Parser) error { 273 jm.state.parser = parser 274 return jm.cleanRunnableStack(ctx) 275 } 276 277 func (jm *JobsWithMissing) Clear() error { 278 if err := jm.state.RemoveMissingJobIDs(jm.missingIDs); err != nil { 279 return err 280 } 281 282 jm.missingIDs.Clear() 283 jm.addToMissingIDs.Clear() 284 jm.removeFromMissingIDs.Clear() 285 286 return jm.Jobs.Clear() 287 } 288 289 func (jm *JobsWithMissing) Has(jobID ids.ID) (bool, error) { 290 if jm.missingIDs.Contains(jobID) { 291 return false, nil 292 } 293 294 return jm.Jobs.Has(jobID) 295 } 296 297 // Push adds a new job to the queue. Returns true if [job] was added to the queue and false 298 // if [job] was already in the queue. 299 func (jm *JobsWithMissing) Push(ctx context.Context, job Job) (bool, error) { 300 jobID := job.ID() 301 if has, err := jm.Has(jobID); err != nil { 302 return false, fmt.Errorf("failed to check for existing job %s due to %w", jobID, err) 303 } else if has { 304 return false, nil 305 } 306 307 deps, err := job.MissingDependencies(ctx) 308 if err != nil { 309 return false, err 310 } 311 // Store this job into the database. 312 if err := jm.state.PutJob(job); err != nil { 313 return false, fmt.Errorf("failed to write job due to %w", err) 314 } 315 316 if deps.Len() != 0 { 317 // This job needs to block on a set of dependencies. 318 for depID := range deps { 319 if err := jm.state.AddDependency(depID, jobID); err != nil { 320 return false, fmt.Errorf("failed to add blocking for depID %s, jobID %s", depID, jobID) 321 } 322 } 323 return true, nil 324 } 325 // This job doesn't have any dependencies, so it should be placed onto the 326 // executable stack. 327 if err := jm.state.AddRunnableJob(jobID); err != nil { 328 return false, fmt.Errorf("failed to add %s as a runnable job due to %w", jobID, err) 329 } 330 return true, nil 331 } 332 333 // AddMissingID adds [jobID] to missingIDs 334 func (jm *JobsWithMissing) AddMissingID(jobIDs ...ids.ID) { 335 for _, jobID := range jobIDs { 336 if !jm.missingIDs.Contains(jobID) { 337 jm.missingIDs.Add(jobID) 338 jm.addToMissingIDs.Add(jobID) 339 jm.removeFromMissingIDs.Remove(jobID) 340 } 341 } 342 } 343 344 // RemoveMissingID removes [jobID] from missingIDs 345 func (jm *JobsWithMissing) RemoveMissingID(jobIDs ...ids.ID) { 346 for _, jobID := range jobIDs { 347 if jm.missingIDs.Contains(jobID) { 348 jm.missingIDs.Remove(jobID) 349 jm.addToMissingIDs.Remove(jobID) 350 jm.removeFromMissingIDs.Add(jobID) 351 } 352 } 353 } 354 355 func (jm *JobsWithMissing) MissingIDs() []ids.ID { 356 return jm.missingIDs.List() 357 } 358 359 func (jm *JobsWithMissing) NumMissingIDs() int { 360 return jm.missingIDs.Len() 361 } 362 363 // Commit the versionDB to the underlying database. 364 func (jm *JobsWithMissing) Commit() error { 365 if jm.addToMissingIDs.Len() != 0 { 366 if err := jm.state.AddMissingJobIDs(jm.addToMissingIDs); err != nil { 367 return err 368 } 369 jm.addToMissingIDs.Clear() 370 } 371 if jm.removeFromMissingIDs.Len() != 0 { 372 if err := jm.state.RemoveMissingJobIDs(jm.removeFromMissingIDs); err != nil { 373 return err 374 } 375 jm.removeFromMissingIDs.Clear() 376 } 377 return jm.Jobs.Commit() 378 } 379 380 // cleanRunnableStack iterates over the jobs on the runnable stack and resets any job 381 // that has missing dependencies to block on those dependencies. 382 // Note: the jobs queue ensures that no job with missing dependencies will be placed 383 // on the runnable stack in the first place. 384 // However, for specific VM implementations blocks may be committed via a two stage commit 385 // (ex. platformvm Proposal and Commit/Abort blocks). This can cause an issue where if the first stage 386 // is executed immediately before the node dies, it will be removed from the runnable stack 387 // without writing the state transition to the VM's database. When the node restarts, the 388 // VM will not have marked the first block (the proposal block as accepted), but it could 389 // have already been removed from the jobs queue. cleanRunnableStack handles this case. 390 func (jm *JobsWithMissing) cleanRunnableStack(ctx context.Context) error { 391 runnableJobsIter := jm.state.runnableJobIDs.NewIterator() 392 defer runnableJobsIter.Release() 393 394 for runnableJobsIter.Next() { 395 jobIDBytes := runnableJobsIter.Key() 396 jobID, err := ids.ToID(jobIDBytes) 397 if err != nil { 398 return fmt.Errorf("failed to convert jobID bytes into ID due to: %w", err) 399 } 400 401 job, err := jm.state.GetJob(ctx, jobID) 402 if err != nil { 403 return fmt.Errorf("failed to retrieve job on runnnable stack due to: %w", err) 404 } 405 deps, err := job.MissingDependencies(ctx) 406 if err != nil { 407 return fmt.Errorf("failed to retrieve missing dependencies of job on runnable stack due to: %w", err) 408 } 409 if deps.Len() == 0 { 410 continue 411 } 412 413 // If the job has missing dependencies, remove it from the runnable stack 414 if err := jm.state.runnableJobIDs.Delete(jobIDBytes); err != nil { 415 return fmt.Errorf("failed to delete jobID from runnable stack due to: %w", err) 416 } 417 418 // Add the missing dependencies to the set that needs to be fetched. 419 jm.AddMissingID(deps.List()...) 420 for depID := range deps { 421 if err := jm.state.AddDependency(depID, jobID); err != nil { 422 return fmt.Errorf("failed to add blocking for depID %s, jobID %s while cleaning the runnable stack", depID, jobID) 423 } 424 } 425 } 426 427 return errors.Join( 428 runnableJobsIter.Error(), 429 jm.Commit(), 430 ) 431 }