github.com/MetalBlockchain/metalgo@v1.11.9/snow/engine/avalanche/bootstrap/bootstrapper.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package bootstrap

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"

	"github.com/MetalBlockchain/metalgo/cache"
	"github.com/MetalBlockchain/metalgo/ids"
	"github.com/MetalBlockchain/metalgo/proto/pb/p2p"
	"github.com/MetalBlockchain/metalgo/snow"
	"github.com/MetalBlockchain/metalgo/snow/choices"
	"github.com/MetalBlockchain/metalgo/snow/consensus/avalanche"
	"github.com/MetalBlockchain/metalgo/snow/engine/common"
	"github.com/MetalBlockchain/metalgo/utils/bimap"
	"github.com/MetalBlockchain/metalgo/utils/heap"
	"github.com/MetalBlockchain/metalgo/utils/set"
	"github.com/MetalBlockchain/metalgo/version"
)

const (
	// We cache processed vertices where height = c * stripeDistance for c = {1,2,3...}
	// This forms a "stripe" of cached DAG vertices at height stripeDistance, 2*stripeDistance, etc.
	// This helps to limit the number of repeated DAG traversals performed
	stripeDistance = 2000
	stripeWidth    = 5
	cacheSize      = 100000

	// statusUpdateFrequency is how many containers should be processed between
	// logs
	statusUpdateFrequency = 5000

	// maxOutstandingGetAncestorsRequests is the maximum number of GetAncestors
	// requests sent but not yet responded to or failed
	maxOutstandingGetAncestorsRequests = 10

	epsilon = 1e-6 // small amount to add to time to avoid division by 0
)

var _ common.BootstrapableEngine = (*bootstrapper)(nil)

func New(
	config Config,
	onFinished func(ctx context.Context, lastReqID uint32) error,
	reg prometheus.Registerer,
) (common.BootstrapableEngine, error) {
	b := &bootstrapper{
		Config: config,

		StateSummaryFrontierHandler: common.NewNoOpStateSummaryFrontierHandler(config.Ctx.Log),
		AcceptedStateSummaryHandler: common.NewNoOpAcceptedStateSummaryHandler(config.Ctx.Log),
		AcceptedFrontierHandler:     common.NewNoOpAcceptedFrontierHandler(config.Ctx.Log),
		AcceptedHandler:             common.NewNoOpAcceptedHandler(config.Ctx.Log),
		PutHandler:                  common.NewNoOpPutHandler(config.Ctx.Log),
		QueryHandler:                common.NewNoOpQueryHandler(config.Ctx.Log),
		ChitsHandler:                common.NewNoOpChitsHandler(config.Ctx.Log),
		AppHandler:                  config.VM,

		outstandingRequests:     bimap.New[common.Request, ids.ID](),
		outstandingRequestTimes: make(map[common.Request]time.Time),

		processedCache: &cache.LRU[ids.ID, struct{}]{Size: cacheSize},
		onFinished:     onFinished,
	}
	return b, b.metrics.Initialize(reg)
}

// Note: To align with the Snowman invariant, it should be guaranteed the VM is
// not used until after the bootstrapper has been Started.
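//
// The embedded no-op handlers drop consensus messages that are not relevant
// while bootstrapping; only app-level messages are forwarded to the VM (via
// the embedded AppHandler).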
type bootstrapper struct {
	Config
	common.Halter

	// list of no-op handlers for messages dropped by the bootstrapper
	common.StateSummaryFrontierHandler
	common.AcceptedStateSummaryHandler
	common.AcceptedFrontierHandler
	common.AcceptedHandler
	common.PutHandler
	common.QueryHandler
	common.ChitsHandler
	common.AppHandler

	metrics

	// tracks which validators were asked for which containers in which requests
	outstandingRequests     *bimap.BiMap[common.Request, ids.ID]
	outstandingRequestTimes map[common.Request]time.Time

	// IDs of vertices that we will send a GetAncestors request for once we are
	// not at the max number of outstanding requests
	needToFetch set.Set[ids.ID]

	// Contains IDs of vertices that have recently been processed
	processedCache *cache.LRU[ids.ID, struct{}]

	// Tracks the last requestID that was used in a request
	requestID uint32

	// Called when bootstrapping is done on a specific chain
	onFinished func(ctx context.Context, lastReqID uint32) error
}

func (b *bootstrapper) Context() *snow.ConsensusContext {
	return b.Ctx
}

func (b *bootstrapper) Clear(context.Context) error {
	b.Ctx.Lock.Lock()
	defer b.Ctx.Lock.Unlock()

	if err := b.VtxBlocked.Clear(); err != nil {
		return err
	}
	if err := b.TxBlocked.Clear(); err != nil {
		return err
	}
	if err := b.VtxBlocked.Commit(); err != nil {
		return err
	}
	return b.TxBlocked.Commit()
}

// Ancestors handles the receipt of multiple containers. Should be received in
// response to a GetAncestors message to [nodeID] with request ID [requestID].
// Expects vtxs[0] to be the vertex requested in the corresponding GetAncestors.
func (b *bootstrapper) Ancestors(ctx context.Context, nodeID ids.NodeID, requestID uint32, vtxs [][]byte) error {
	request := common.Request{
		NodeID:    nodeID,
		RequestID: requestID,
	}
	requestedVtxID, ok := b.outstandingRequests.DeleteKey(request)
	if !ok { // this message isn't in response to a request we made
		b.Ctx.Log.Debug("received unexpected Ancestors",
			zap.Stringer("nodeID", nodeID),
			zap.Uint32("requestID", requestID),
		)
		return nil
	}
	requestTime := b.outstandingRequestTimes[request]
	delete(b.outstandingRequestTimes, request)

	lenVtxs := len(vtxs)
	if lenVtxs == 0 {
		b.Ctx.Log.Debug("Ancestors contains no vertices",
			zap.Stringer("nodeID", nodeID),
			zap.Uint32("requestID", requestID),
		)

		b.PeerTracker.RegisterFailure(nodeID)
		return b.fetch(ctx, requestedVtxID)
	}

	if lenVtxs > b.Config.AncestorsMaxContainersReceived {
		vtxs = vtxs[:b.Config.AncestorsMaxContainersReceived]

		b.Ctx.Log.Debug("ignoring containers in Ancestors",
			zap.Stringer("nodeID", nodeID),
			zap.Uint32("requestID", requestID),
			zap.Int("numIgnored", lenVtxs-b.Config.AncestorsMaxContainersReceived),
		)
	}

	vtx, err := b.Manager.ParseVtx(ctx, vtxs[0])
	if err != nil {
		b.Ctx.Log.Debug("failed to parse requested vertex",
			zap.Stringer("nodeID", nodeID),
			zap.Uint32("requestID", requestID),
			zap.Stringer("vtxID", requestedVtxID),
			zap.Error(err),
		)

		b.PeerTracker.RegisterFailure(nodeID)
		return b.fetch(ctx, requestedVtxID)
	}

	if actualID := vtx.ID(); actualID != requestedVtxID {
		b.Ctx.Log.Debug("received incorrect vertex",
			zap.Stringer("nodeID", nodeID),
			zap.Uint32("requestID", requestID),
zap.Stringer("vtxID", actualID), 188 ) 189 190 b.PeerTracker.RegisterFailure(nodeID) 191 return b.fetch(ctx, requestedVtxID) 192 } 193 194 b.needToFetch.Remove(requestedVtxID) 195 196 // All vertices added to [verticesToProcess] have received transitive votes 197 // from the accepted frontier. 198 var ( 199 numBytes = len(vtxs[0]) 200 verticesToProcess = make([]avalanche.Vertex, 1, len(vtxs)) 201 ) 202 verticesToProcess[0] = vtx 203 204 parents, err := vtx.Parents() 205 if err != nil { 206 return err 207 } 208 eligibleVertices := set.NewSet[ids.ID](len(parents)) 209 for _, parent := range parents { 210 eligibleVertices.Add(parent.ID()) 211 } 212 213 for _, vtxBytes := range vtxs[1:] { 214 vtx, err := b.Manager.ParseVtx(ctx, vtxBytes) // Persists the vtx 215 if err != nil { 216 b.Ctx.Log.Debug("failed to parse vertex", 217 zap.Stringer("nodeID", nodeID), 218 zap.Uint32("requestID", requestID), 219 zap.Error(err), 220 ) 221 break 222 } 223 vtxID := vtx.ID() 224 if !eligibleVertices.Contains(vtxID) { 225 b.Ctx.Log.Debug("received vertex that should not have been included", 226 zap.Stringer("nodeID", nodeID), 227 zap.Uint32("requestID", requestID), 228 zap.Stringer("vtxID", vtxID), 229 ) 230 break 231 } 232 eligibleVertices.Remove(vtxID) 233 parents, err := vtx.Parents() 234 if err != nil { 235 return err 236 } 237 for _, parent := range parents { 238 eligibleVertices.Add(parent.ID()) 239 } 240 241 numBytes += len(vtxBytes) 242 verticesToProcess = append(verticesToProcess, vtx) 243 b.needToFetch.Remove(vtxID) // No need to fetch this vertex since we have it now 244 } 245 246 // TODO: Calculate bandwidth based on the vertices that were persisted to 247 // disk. 248 var ( 249 requestLatency = time.Since(requestTime).Seconds() + epsilon 250 bandwidth = float64(numBytes) / requestLatency 251 ) 252 b.PeerTracker.RegisterResponse(nodeID, bandwidth) 253 254 return b.process(ctx, verticesToProcess...) 255 } 256 257 func (b *bootstrapper) GetAncestorsFailed(ctx context.Context, nodeID ids.NodeID, requestID uint32) error { 258 request := common.Request{ 259 NodeID: nodeID, 260 RequestID: requestID, 261 } 262 vtxID, ok := b.outstandingRequests.DeleteKey(request) 263 if !ok { 264 b.Ctx.Log.Debug("unexpectedly called GetAncestorsFailed", 265 zap.Stringer("nodeID", nodeID), 266 zap.Uint32("requestID", requestID), 267 ) 268 return nil 269 } 270 delete(b.outstandingRequestTimes, request) 271 272 // This node timed out their request. 
	b.PeerTracker.RegisterFailure(nodeID)

	// Send another request for the vertex
	return b.fetch(ctx, vtxID)
}

func (b *bootstrapper) Connected(
	ctx context.Context,
	nodeID ids.NodeID,
	nodeVersion *version.Application,
) error {
	if err := b.VM.Connected(ctx, nodeID, nodeVersion); err != nil {
		return err
	}

	return b.StartupTracker.Connected(ctx, nodeID, nodeVersion)
}

func (b *bootstrapper) Disconnected(ctx context.Context, nodeID ids.NodeID) error {
	if err := b.VM.Disconnected(ctx, nodeID); err != nil {
		return err
	}

	return b.StartupTracker.Disconnected(ctx, nodeID)
}

func (*bootstrapper) Timeout(context.Context) error {
	return nil
}

func (*bootstrapper) Gossip(context.Context) error {
	return nil
}

func (b *bootstrapper) Shutdown(ctx context.Context) error {
	b.Ctx.Log.Info("shutting down bootstrapper")

	b.Ctx.Lock.Lock()
	defer b.Ctx.Lock.Unlock()

	return b.VM.Shutdown(ctx)
}

func (*bootstrapper) Notify(context.Context, common.Message) error {
	return nil
}

func (b *bootstrapper) Start(ctx context.Context, startReqID uint32) error {
	b.Ctx.Log.Info("starting bootstrap")

	b.Ctx.State.Set(snow.EngineState{
		Type:  p2p.EngineType_ENGINE_TYPE_AVALANCHE,
		State: snow.Bootstrapping,
	})
	if err := b.VM.SetState(ctx, snow.Bootstrapping); err != nil {
		return fmt.Errorf("failed to notify VM that bootstrapping has started: %w",
			err)
	}

	if err := b.VtxBlocked.SetParser(ctx, &vtxParser{
		log:         b.Ctx.Log,
		numAccepted: b.numAcceptedVts,
		manager:     b.Manager,
	}); err != nil {
		return err
	}

	if err := b.TxBlocked.SetParser(&txParser{
		log:         b.Ctx.Log,
		numAccepted: b.numAcceptedTxs,
		vm:          b.VM,
	}); err != nil {
		return err
	}

	b.requestID = startReqID

	// If the network was already linearized, don't attempt to linearize it
	// again.
	linearized, err := b.Manager.StopVertexAccepted(ctx)
	if err != nil {
		return fmt.Errorf("failed to get linearization status: %w", err)
	}
	if linearized {
		return b.startSyncing(ctx, nil)
	}

	// If a stop vertex is well known, accept that.
	if b.Config.StopVertexID != ids.Empty {
		b.Ctx.Log.Info("using well known stop vertex",
			zap.Stringer("vtxID", b.Config.StopVertexID),
		)

		return b.startSyncing(ctx, []ids.ID{b.Config.StopVertexID})
	}

	// If a stop vertex isn't well known, treat the current state as the final
	// DAG state.
	//
	// Note: This is used to linearize networks that were created after the
	// linearization occurred.
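	//
	// Here the edge (the current accepted frontier) becomes the parent set of
	// the newly built stop vertex, which is then accepted immediately.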
	edge := b.Manager.Edge(ctx)
	stopVertex, err := b.Manager.BuildStopVtx(ctx, edge)
	if err != nil {
		return fmt.Errorf("failed to create stop vertex: %w", err)
	}
	if err := stopVertex.Accept(ctx); err != nil {
		return fmt.Errorf("failed to accept stop vertex: %w", err)
	}

	stopVertexID := stopVertex.ID()
	b.Ctx.Log.Info("generated stop vertex",
		zap.Stringer("vtxID", stopVertexID),
	)

	return b.startSyncing(ctx, nil)
}

func (b *bootstrapper) HealthCheck(ctx context.Context) (interface{}, error) {
	b.Ctx.Lock.Lock()
	defer b.Ctx.Lock.Unlock()

	vmIntf, vmErr := b.VM.HealthCheck(ctx)
	intf := map[string]interface{}{
		"consensus": struct{}{},
		"vm":        vmIntf,
	}
	return intf, vmErr
}

// Add the vertices in [vtxIDs] to the set of vertices that we need to fetch,
// and then fetch vertices (and their ancestors) until either there are no more
// to fetch or we are at the maximum number of outstanding requests.
func (b *bootstrapper) fetch(ctx context.Context, vtxIDs ...ids.ID) error {
	b.needToFetch.Add(vtxIDs...)
	for b.needToFetch.Len() > 0 && b.outstandingRequests.Len() < maxOutstandingGetAncestorsRequests {
		vtxID, _ := b.needToFetch.Pop() // Length checked in predicate above

		// Make sure we haven't already requested this vertex
		if b.outstandingRequests.HasValue(vtxID) {
			continue
		}

		// Make sure we don't already have this vertex
		if _, err := b.Manager.GetVtx(ctx, vtxID); err == nil {
			continue
		}

		nodeID, ok := b.PeerTracker.SelectPeer()
		if !ok {
			// If we aren't connected to any peers, we send a request to
			// ourselves, which is guaranteed to fail. We send this message to
			// use the message timeout as a retry mechanism. Once we are
			// connected to another node again we will select them to sample
			// from.
			nodeID = b.Ctx.NodeID
		}

		b.PeerTracker.RegisterRequest(nodeID)

		b.requestID++
		request := common.Request{
			NodeID:    nodeID,
			RequestID: b.requestID,
		}
		b.outstandingRequests.Put(request, vtxID)
		b.outstandingRequestTimes[request] = time.Now()
		b.Config.Sender.SendGetAncestors(ctx, nodeID, b.requestID, vtxID) // request vertex and ancestors
	}
	return b.checkFinish(ctx)
}

// Process the vertices in [vtxs].
func (b *bootstrapper) process(ctx context.Context, vtxs ...avalanche.Vertex) error {
	// Vertices that we need to process prioritized by vertices that are unknown
	// or the furthest down the DAG. Unknown vertices are prioritized to ensure
	// that once we have made it below a certain height in DAG traversal we do
	// not need to reset and repeat DAG traversals.
	toProcess := heap.NewMap[ids.ID, avalanche.Vertex](vertexLess)
	for _, vtx := range vtxs {
		vtxID := vtx.ID()
		if _, ok := b.processedCache.Get(vtxID); !ok { // only process a vertex if we haven't already
			_, _ = toProcess.Push(vtxID, vtx)
		} else {
			b.VtxBlocked.RemoveMissingID(vtxID)
		}
	}

	vtxHeightSet := set.Set[ids.ID]{}
	prevHeight := uint64(0)

	for {
		if b.Halted() {
			return nil
		}

		vtxID, vtx, ok := toProcess.Pop()
		if !ok {
			break
		}

		switch vtx.Status() {
		case choices.Unknown:
			b.VtxBlocked.AddMissingID(vtxID)
			b.needToFetch.Add(vtxID) // We don't have this vertex locally. Mark that we need to fetch it.
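		// Note: every vertex traversed here has transitive votes from the
		// accepted frontier, so a rejected vertex means our local state is
		// inconsistent with that frontier; treat it as fatal.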
		case choices.Rejected:
			return fmt.Errorf("tried to accept %s even though it was previously rejected", vtxID)
		case choices.Processing:
			b.needToFetch.Remove(vtxID)
			b.VtxBlocked.RemoveMissingID(vtxID)

			// Add to queue of vertices to execute when bootstrapping finishes.
			pushed, err := b.VtxBlocked.Push(ctx, &vertexJob{
				log:         b.Ctx.Log,
				numAccepted: b.numAcceptedVts,
				vtx:         vtx,
			})
			if err != nil {
				return err
			}
			if !pushed {
				// If the vertex is already on the queue, then we have already
				// pushed [vtx]'s transactions and traversed into its parents.
				continue
			}

			txs, err := vtx.Txs(ctx)
			if err != nil {
				return err
			}

			for _, tx := range txs {
				// Add to queue of txs to execute when bootstrapping finishes.
				pushed, err := b.TxBlocked.Push(ctx, &txJob{
					log:         b.Ctx.Log,
					numAccepted: b.numAcceptedTxs,
					tx:          tx,
				})
				if err != nil {
					return err
				}
				if pushed {
					b.numFetchedTxs.Inc()
				}
			}

			b.numFetchedVts.Inc()

			verticesFetchedSoFar := b.VtxBlocked.Jobs.PendingJobs()
			if verticesFetchedSoFar%statusUpdateFrequency == 0 { // Periodically print progress
				b.Ctx.Log.Info("fetched vertices",
					zap.Uint64("numVerticesFetched", verticesFetchedSoFar),
				)
			}

			parents, err := vtx.Parents()
			if err != nil {
				return err
			}
			for _, parent := range parents { // Process the parents of this vertex (traverse up the DAG)
				parentID := parent.ID()
				if _, ok := b.processedCache.Get(parentID); !ok { // But only if we haven't processed the parent
					if !vtxHeightSet.Contains(parentID) {
						toProcess.Push(parentID, parent)
					}
				}
			}
			height, err := vtx.Height()
			if err != nil {
				return err
			}
			if height%stripeDistance < stripeWidth { // See comment for stripeDistance
				b.processedCache.Put(vtxID, struct{}{})
			}
			if height == prevHeight {
				vtxHeightSet.Add(vtxID)
			} else {
				// Set new height and reset [vtxHeightSet]
				prevHeight = height
				vtxHeightSet.Clear()
				vtxHeightSet.Add(vtxID)
			}
		}
	}

	if err := b.TxBlocked.Commit(); err != nil {
		return err
	}
	if err := b.VtxBlocked.Commit(); err != nil {
		return err
	}

	return b.fetch(ctx)
}

// startSyncing starts bootstrapping. Process the vertices in [acceptedContainerIDs].
func (b *bootstrapper) startSyncing(ctx context.Context, acceptedContainerIDs []ids.ID) error {
	pendingContainerIDs := b.VtxBlocked.MissingIDs()
	// Append the list of accepted container IDs to pendingContainerIDs to ensure
	// we iterate over every container that must be traversed.
	pendingContainerIDs = append(pendingContainerIDs, acceptedContainerIDs...)
	b.Ctx.Log.Debug("starting bootstrapping",
		zap.Int("numMissingVertices", len(pendingContainerIDs)),
		zap.Int("numAcceptedVertices", len(acceptedContainerIDs)),
	)
	toProcess := make([]avalanche.Vertex, 0, len(pendingContainerIDs))
	for _, vtxID := range pendingContainerIDs {
		if vtx, err := b.Manager.GetVtx(ctx, vtxID); err == nil {
			if vtx.Status() == choices.Accepted {
				b.VtxBlocked.RemoveMissingID(vtxID)
			} else {
				toProcess = append(toProcess, vtx) // Process this vertex.
			}
		} else {
			b.VtxBlocked.AddMissingID(vtxID)
			b.needToFetch.Add(vtxID) // We don't have this vertex. Mark that we have to fetch it.
		}
	}
	return b.process(ctx, toProcess...)
}

// checkFinish executes any pending transactions and vertices and then
// linearizes the chain, once there are no more vertices left to fetch.
func (b *bootstrapper) checkFinish(ctx context.Context) error {
	// If we still need to fetch vertices, we can't finish
	if len(b.VtxBlocked.MissingIDs()) > 0 {
		return nil
	}

	b.Ctx.Log.Info("executing transactions")
	_, err := b.TxBlocked.ExecuteAll(
		ctx,
		b.Config.Ctx,
		b,
		false,
		b.Ctx.TxAcceptor,
	)
	if err != nil || b.Halted() {
		return err
	}

	b.Ctx.Log.Info("executing vertices")
	_, err = b.VtxBlocked.ExecuteAll(
		ctx,
		b.Config.Ctx,
		b,
		false,
		b.Ctx.VertexAcceptor,
	)
	if err != nil || b.Halted() {
		return err
	}

	// Invariant: edge will only be the stop vertex
	edge := b.Manager.Edge(ctx)
	stopVertexID := edge[0]
	if err := b.VM.Linearize(ctx, stopVertexID); err != nil {
		return err
	}

	b.processedCache.Flush()
	return b.onFinished(ctx, b.requestID)
}

// A vertex is less than another vertex if it is unknown. Ties are broken by
// prioritizing vertices that have a greater height.
func vertexLess(i, j avalanche.Vertex) bool {
	if !i.Status().Fetched() {
		return true
	}
	if !j.Status().Fetched() {
		return false
	}

	// Treat errors on retrieving the height as if the vertex is not fetched
	heightI, errI := i.Height()
	if errI != nil {
		return true
	}
	heightJ, errJ := j.Height()
	if errJ != nil {
		return false
	}

	return heightI > heightJ
}
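// With the heap in process ordered by vertexLess, unknown (unfetched) vertices
// pop first, followed by fetched vertices from greatest height downward, so
// the traversal does not have to revisit lower portions of the DAG.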