github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/common/requester/engine.go

package requester

import (
	"fmt"
	"math"
	"time"

	"github.com/rs/zerolog"
	"github.com/vmihailenco/msgpack"
	"go.uber.org/atomic"

	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/state/protocol"
	"github.com/onflow/flow-go/utils/logging"
	"github.com/onflow/flow-go/utils/rand"
)

// HandleFunc is a function provided to the requester engine to handle an entity
// once it has been retrieved from a provider. The function should be non-blocking
// and errors should be handled internally within the function.
type HandleFunc func(originID flow.Identifier, entity flow.Entity)

// CreateFunc is a function that creates a `flow.Entity` with an underlying type
// so that we can properly decode entities transmitted over the network.
type CreateFunc func() flow.Entity
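
// exampleCreateAndHandle is an illustrative sketch and not part of the upstream
// file: a typical CreateFunc/HandleFunc pair for requesting collections. The
// choice of flow.Collection as the concrete entity type is an assumption made
// for the example.
func exampleCreateAndHandle() (CreateFunc, HandleFunc) {
	// allocate the concrete type so a received blob can be decoded into it
	create := func() flow.Entity {
		return &flow.Collection{}
	}
	// hand the entity off without blocking; errors must be handled internally
	handle := func(originID flow.Identifier, entity flow.Entity) {
		collection, ok := entity.(*flow.Collection)
		if !ok {
			return
		}
		_ = originID
		_ = collection
	}
	return create, handle
}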

// Engine is a generic requester engine, handling the requesting of entities
// on the flow network. It is the `request` part of the request-reply
// pattern provided by the pair of generic exchange engines.
type Engine struct {
	unit     *engine.Unit
	log      zerolog.Logger
	cfg      Config
	metrics  module.EngineMetrics
	me       module.Local
	state    protocol.State
	con      network.Conduit
	channel  channels.Channel
	selector flow.IdentityFilter[flow.Identity]
	create   CreateFunc
	handle   HandleFunc

	// changes to the following state variables must be guarded by unit.Lock()
	items                 map[flow.Identifier]*Item
	requests              map[uint64]*messages.EntityRequest
	forcedDispatchOngoing *atomic.Bool // ensures the dispatching logic is triggered at most once at any time
}

// New creates a new requester engine, operating on the provided network channel, and requesting entities from a node
// within the set obtained by applying the provided selector filter. The options allow customization of the parameters
// related to the batch and retry logic.
func New(log zerolog.Logger, metrics module.EngineMetrics, net network.EngineRegistry, me module.Local, state protocol.State,
	channel channels.Channel, selector flow.IdentityFilter[flow.Identity], create CreateFunc, options ...OptionFunc) (*Engine, error) {

	// initialize the default config
	cfg := Config{
		BatchThreshold:  32,
		BatchInterval:   time.Second,
		RetryInitial:    4 * time.Second,
		RetryFunction:   RetryGeometric(2),
		RetryMaximum:    2 * time.Minute,
		RetryAttempts:   math.MaxUint32,
		ValidateStaking: true,
	}
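
	// With these defaults, an item's retry interval doubles on each attempt
	// (RetryGeometric(2)) and is clamped to RetryMaximum further below in
	// dispatchRequest, giving the per-item schedule 4s, 8s, 16s, 32s, 64s,
	// 2m, 2m, ...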

	// apply the custom option parameters
	for _, option := range options {
		option(&cfg)
	}

	// check validity of retry function
	interval := cfg.RetryFunction(time.Second)
	if interval < time.Second {
		return nil, fmt.Errorf("invalid retry function (interval must always increase)")
	}

	// check validity of maximum interval
	if cfg.RetryMaximum < cfg.RetryInitial {
		return nil, fmt.Errorf("invalid retry maximum (must not be smaller than initial interval)")
	}

	// make sure we don't send requests to ourselves or to ejected nodes
	selector = filter.And(
		selector,
		filter.Not(filter.HasNodeID[flow.Identity](me.NodeID())),
		filter.Not(filter.HasParticipationStatus(flow.EpochParticipationStatusEjected)),
	)

	// make sure we only send requests to nodes that are active in the current epoch and have positive weight
	if cfg.ValidateStaking {
		selector = filter.And(
			selector,
			filter.HasInitialWeight[flow.Identity](true),
			filter.HasParticipationStatus(flow.EpochParticipationStatusActive),
		)
	}

	// initialize the requester engine with its dependencies
	e := &Engine{
		unit:                  engine.NewUnit(),
		log:                   log.With().Str("engine", "requester").Logger(),
		cfg:                   cfg,
		metrics:               metrics,
		me:                    me,
		state:                 state,
		channel:               channel,
		selector:              selector,
		create:                create,
		handle:                nil, // must be set via WithHandle before calling Ready
		items:                 make(map[flow.Identifier]*Item),          // holds all pending items
		requests:              make(map[uint64]*messages.EntityRequest), // holds all sent requests
		forcedDispatchOngoing: atomic.NewBool(false),
	}

	// register the engine with the network layer and store the conduit
	con, err := net.Register(channel, e)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}
	e.con = con

	return e, nil
}
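
// exampleNew is an illustrative sketch and not part of the upstream file: it
// wires up a requester engine for collections. The channel, the role-based
// selector, and the concrete entity type are assumptions chosen for the
// example, not the only valid configuration.
func exampleNew(
	log zerolog.Logger,
	metrics module.EngineMetrics,
	net network.EngineRegistry,
	me module.Local,
	state protocol.State,
) (*Engine, error) {
	return New(log, metrics, net, me, state,
		// request collections on the collection-request channel
		channels.RequestCollections,
		// only consider collection nodes as providers
		filter.HasRole[flow.Identity](flow.RoleCollection),
		// allocate the concrete type that response blobs are decoded into
		func() flow.Entity { return &flow.Collection{} },
	)
}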

// WithHandle sets the handle function of the requester, which is how it processes
// returned entities. The engine cannot be started without setting the handle
// function. It is done in a separate call so that the requester can be injected
// into a consuming engine upon construction, and the consuming engine can then
// register its own handle function with the requester.
func (e *Engine) WithHandle(handle HandleFunc) {
	e.handle = handle
}
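
// A minimal wiring sketch (illustrative; NewConsumer and HandleEntity are
// hypothetical names for a consuming engine and its handler):
//
//	req, _ := New(log, metrics, net, me, state, channel, selector, create)
//	consumer := NewConsumer(req)          // consumer later calls req.EntityByID
//	req.WithHandle(consumer.HandleEntity) // handler must be set before Ready
//	<-req.Ready()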

// Ready returns a ready channel that is closed once the engine has fully
// started. For the requester engine, this is true once the polling loop for
// dispatching batch requests has been launched. Ready panics if no handle
// function has been set via WithHandle.
func (e *Engine) Ready() <-chan struct{} {
	if e.handle == nil {
		panic("must initialize requester engine with handler")
	}
	e.unit.Launch(e.poll)
	return e.unit.Ready()
}

// Done returns a done channel that is closed once the engine has fully stopped.
func (e *Engine) Done() <-chan struct{} {
	return e.unit.Done()
}

// SubmitLocal submits a message originating on the local node.
func (e *Engine) SubmitLocal(message interface{}) {
	e.unit.Launch(func() {
		err := e.process(e.me.NodeID(), message)
		if err != nil {
			engine.LogError(e.log, err)
		}
	})
}

// Submit submits the given message from the node with the given origin ID
// for processing in a non-blocking manner. It returns instantly and logs
// a potential processing error internally when done.
func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, message interface{}) {
	e.unit.Launch(func() {
		err := e.Process(channel, originID, message)
		if err != nil {
			engine.LogError(e.log, err)
		}
	})
}

// ProcessLocal processes a message originating on the local node.
func (e *Engine) ProcessLocal(message interface{}) error {
	return e.unit.Do(func() error {
		return e.process(e.me.NodeID(), message)
	})
}

// Process processes the given message from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, message interface{}) error {
	return e.unit.Do(func() error {
		return e.process(originID, message)
	})
}

// EntityByID adds an entity to the list of entities to be requested from the
// provider. It is idempotent, meaning that adding the same entity to the
// requester engine multiple times has no effect, unless the item has
// expired due to too many requests and has thus been deleted from the
// list. The provided selector will be applied to the set of valid providers on top
// of the global selector injected upon construction. It allows for finer-grained
// control over which subset of providers to request a given entity from, such as
// selection of a collection cluster. Use `filter.Any` if no additional selection
// is required. The response is checked for integrity, to make sure we received
// the exact entity we requested.
func (e *Engine) EntityByID(entityID flow.Identifier, selector flow.IdentityFilter[flow.Identity]) {
	e.addEntityRequest(entityID, selector, true)
}
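
// For example (illustrative; clusterIDs is a hypothetical list of the node IDs
// of the cluster that produced the collection):
//
//	requester.EntityByID(collectionID, filter.HasNodeID[flow.Identity](clusterIDs...))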

// Query will request data through the request engine backing the interface.
// The additional selector will be applied to the subset
// of valid providers for the data and allows finer-grained control
// over which providers to request data from. It does not perform an integrity
// check, so it can be used to get entities without knowing their ID.
func (e *Engine) Query(key flow.Identifier, selector flow.IdentityFilter[flow.Identity]) {
	e.addEntityRequest(key, selector, false)
}
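
// For example (illustrative; the key is whatever identifier the providers use
// to index the data, which need not equal the returned entity's own ID):
//
//	requester.Query(key, filter.Any[flow.Identity])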

func (e *Engine) addEntityRequest(entityID flow.Identifier, selector flow.IdentityFilter[flow.Identity], checkIntegrity bool) {
	e.unit.Lock()
	defer e.unit.Unlock()

	// check if we already have an item for this entity
	_, duplicate := e.items[entityID]
	if duplicate {
		return
	}

	// otherwise, add a new item to the list
	item := &Item{
		EntityID:       entityID,
		NumAttempts:    0,
		LastRequested:  time.Time{},
		RetryAfter:     e.cfg.RetryInitial,
		ExtraSelector:  selector,
		checkIntegrity: checkIntegrity,
	}
	e.items[entityID] = item
}

// Force will force the requester engine to dispatch all currently
// valid batch requests.
func (e *Engine) Force() {
	// exit early in case a forced dispatch is currently ongoing
	if e.forcedDispatchOngoing.Load() {
		return
	}

	// use Launch to ensure the caller won't be blocked
	e.unit.Launch(func() {
		// use an atomic bool to ensure that at most one caller triggers the
		// dispatching of requests at any time
		if e.forcedDispatchOngoing.CompareAndSwap(false, true) {
			count := uint(0)
			for {
				dispatched, err := e.dispatchRequest()
				if err != nil {
					e.log.Error().Err(err).Msg("could not dispatch requests")
					break
				}
				if !dispatched {
					e.log.Debug().Uint("requests", count).Msg("forced request dispatch")
					break
				}
				count++
			}
			e.forcedDispatchOngoing.Store(false)
		}
	})
}

func (e *Engine) poll() {
	ticker := time.NewTicker(e.cfg.BatchInterval)

PollLoop:
	for {
		select {
		case <-e.unit.Quit():
			break PollLoop

		case <-ticker.C:
			// skip this tick while a forced dispatch is in flight
			if e.forcedDispatchOngoing.Load() {
				continue PollLoop
			}

			dispatched, err := e.dispatchRequest()
			if err != nil {
				e.log.Error().Err(err).Msg("could not dispatch requests")
				continue PollLoop
			}
			if dispatched {
				e.log.Debug().Uint("requests", 1).Msg("regular request dispatch")
			}
		}
	}

	ticker.Stop()
}

// dispatchRequest dispatches a subset of requests (selection based on an internal heuristic).
// `dispatchRequest` sends a request, covering some (but not necessarily all) items,
// if and only if there is something to request. In other words, it cannot happen that
// `dispatchRequest` sends no request while there is something to be requested.
// The boolean return value indicates whether a request was dispatched at all.
func (e *Engine) dispatchRequest() (bool, error) {

	e.unit.Lock()
	defer e.unit.Unlock()

	e.log.Debug().Int("num_entities", len(e.items)).Msg("selecting entities")

	// get the current top-level set of valid providers
	providers, err := e.state.Final().Identities(e.selector)
	if err != nil {
		return false, fmt.Errorf("could not get providers: %w", err)
	}

	// randomize the order of items, so that they can be requested in a different order each time
	rndItems := make([]flow.Identifier, 0, len(e.items))
	for k := range e.items {
		rndItems = append(rndItems, e.items[k].EntityID)
	}
	err = rand.Shuffle(uint(len(rndItems)), func(i, j uint) {
		rndItems[i], rndItems[j] = rndItems[j], rndItems[i]
	})
	if err != nil {
		return false, fmt.Errorf("shuffle failed: %w", err)
	}

	// go through each item and decide if it should be requested again
	now := time.Now().UTC()
	var providerID flow.Identifier
	var entityIDs []flow.Identifier
	for _, entityID := range rndItems {
		item := e.items[entityID]

		// if the item should not be requested yet, ignore it
		cutoff := item.LastRequested.Add(item.RetryAfter)
		if cutoff.After(now) {
			continue
		}

		// if the item reached the maximum number of retries, drop it
		if item.NumAttempts >= e.cfg.RetryAttempts {
			e.log.Debug().Str("entity_id", entityID.String()).Msg("dropping entity ID: max number of retries reached")
			delete(e.items, entityID)
			continue
		}

		// if the provider has already been chosen, check if this item
		// can be requested from the same provider; otherwise skip it
		// for now, so it will be part of the next batch request
		if providerID != flow.ZeroID {
			overlap := providers.Filter(filter.And(
				filter.HasNodeID[flow.Identity](providerID),
				item.ExtraSelector,
			))
			if len(overlap) == 0 {
				continue
			}
		}

		// if no provider has been chosen yet, choose from the restricted set
		// NOTE: a single item cannot permanently block requests from going
		// out when no providers are available for it, because the iteration
		// order is random and will skip the item most of the time
		// when other items are available
		if providerID == flow.ZeroID {
			providers = providers.Filter(item.ExtraSelector)
			if len(providers) == 0 {
				return false, fmt.Errorf("no valid providers available")
			}
			id, err := providers.Sample(1)
			if err != nil {
				return false, fmt.Errorf("sampling failed: %w", err)
			}
			providerID = id[0].NodeID
		}

		// add the item to the list and set its retry parameters
		// NOTE: we add the retry interval to the last requested timestamp,
		// rather than using the current timestamp, in order to preserve a
		// more even distribution of timestamps over time, which should lead
		// to a more even distribution of entities over batch requests
		entityIDs = append(entityIDs, entityID)
		item.NumAttempts++
		item.LastRequested = now
		item.RetryAfter = e.cfg.RetryFunction(item.RetryAfter)

		// make sure the interval is within parameters
		if item.RetryAfter < e.cfg.RetryInitial {
			item.RetryAfter = e.cfg.RetryInitial
		}
		if item.RetryAfter > e.cfg.RetryMaximum {
			item.RetryAfter = e.cfg.RetryMaximum
		}

		// if we reached the maximum size for a batch, bail
		if uint(len(entityIDs)) >= e.cfg.BatchThreshold {
			break
		}
	}

	// if there are no items to request, return
	if len(entityIDs) == 0 {
		return false, nil
	}

	nonce, err := rand.Uint64()
	if err != nil {
		return false, fmt.Errorf("nonce generation failed: %w", err)
	}

	// create a batch request, send it and store it for reference
	req := &messages.EntityRequest{
		Nonce:     nonce,
		EntityIDs: entityIDs,
	}

	requestStart := time.Now()

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(providerID)).
			Uint64("nonce", req.Nonce).
			Int("num_selected", len(entityIDs)).
			Strs("entities", logging.IDs(entityIDs)).
			Msg("sending entity request")
	}

	err = e.con.Unicast(req, providerID)
	if err != nil {
		return true, fmt.Errorf("could not send request for entities %v: %w", logging.IDs(entityIDs), err)
	}
	e.requests[req.Nonce] = req

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(providerID)).
			Uint64("nonce", req.Nonce).
			Strs("entities", logging.IDs(entityIDs)).
			TimeDiff("duration", time.Now(), requestStart).
			Msg("entity request sent")
	}

	// NOTE: we forget about requests after the expiry of the shortest retry time
	// of the entities in the list; this means that we purge requests aggressively.
	// However, most requests should be answered on the first attempt, and clearing
	// them up early only removes the ability to instantly retry upon partial
	// responses, so the impact is minimal.
	go func() {
		<-time.After(e.cfg.RetryInitial)

		e.unit.Lock()
		defer e.unit.Unlock()
		delete(e.requests, req.Nonce)
	}()

	e.metrics.MessageSent(e.channel.String(), metrics.MessageEntityRequest)
	e.log.Debug().
		Uint64("nonce", req.Nonce).
		Strs("entity_ids", flow.IdentifierList(req.EntityIDs).Strings()).
		Msg("entity request sent")

	return true, nil
}

// process processes entity responses received by the requester engine from the
// network.
func (e *Engine) process(originID flow.Identifier, message interface{}) error {

	e.metrics.MessageReceived(e.channel.String(), metrics.MessageEntityResponse)
	defer e.metrics.MessageHandled(e.channel.String(), metrics.MessageEntityResponse)

	switch msg := message.(type) {
	case *messages.EntityResponse:
		return e.onEntityResponse(originID, msg)
	default:
		return engine.NewInvalidInputErrorf("invalid message type (%T)", message)
	}
}

func (e *Engine) onEntityResponse(originID flow.Identifier, res *messages.EntityResponse) error {
	lg := e.log.With().Str("origin_id", originID.String()).Uint64("nonce", res.Nonce).Logger()

	lg.Debug().Strs("entity_ids", flow.IdentifierList(res.EntityIDs).Strings()).Msg("entity response received")

	if e.cfg.ValidateStaking {

		// check that the response comes from a valid provider
		providers, err := e.state.Final().Identities(filter.And(
			e.selector,
			filter.HasNodeID[flow.Identity](originID),
		))
		if err != nil {
			return fmt.Errorf("could not get providers: %w", err)
		}
		if len(providers) == 0 {
			return engine.NewInvalidInputErrorf("invalid provider origin (%x)", originID)
		}
	}

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(originID)).
			Strs("entities", logging.IDs(res.EntityIDs)).
			Uint64("nonce", res.Nonce).
			Msg("onEntityResponse entries received")
	}

	e.unit.Lock()
	defer e.unit.Unlock()

	// build a list of needed entities; if not available, process anyway,
	// but in that case we can't re-queue missing items
	needed := make(map[flow.Identifier]struct{})
	req, exists := e.requests[res.Nonce]
	if exists {
		delete(e.requests, req.Nonce)
		for _, entityID := range req.EntityIDs {
			needed[entityID] = struct{}{}
		}
	}

	// ensure the response is correctly formed
	if len(res.Blobs) != len(res.EntityIDs) {
		return engine.NewInvalidInputErrorf("invalid response with %d blobs, %d IDs", len(res.Blobs), len(res.EntityIDs))
	}

	// process each entity in the response
	// NOTE: this requires engines to be somewhat idempotent, which is a good
	// thing, as it increases the robustness of their code
	for i := 0; i < len(res.Blobs); i++ {
		blob := res.Blobs[i]
		entityID := res.EntityIDs[i]

		// the entity might already have been returned in another response
		item, exists := e.items[entityID]
		if !exists {
			lg.Debug().Hex("entity_id", logging.ID(entityID)).Msg("entity not in items, skipping")
			continue
		}

		// create the entity with the underlying concrete type and decode the blob
		entity := e.create()
		err := msgpack.Unmarshal(blob, &entity)
		if err != nil {
			return fmt.Errorf("could not decode entity: %w", err)
		}

		if item.checkIntegrity {
			actualEntityID := entity.ID()
			// validate that we got the correct entity, exactly what we were expecting
			if entityID != actualEntityID {
				lg.Error().
					Hex("stated_entity_id", logging.ID(entityID)).
					Hex("provided_entity", logging.ID(actualEntityID)).
					Bool(logging.KeySuspicious, true).
					Msg("provided entity does not match stated ID")
				continue
			}
		}

		// remove from needed items and pending items
		delete(needed, entityID)
		delete(e.items, entityID)

		// process the entity
		go e.handle(originID, entity)
	}

	// requeue requested entities that have not been delivered in the response
	// NOTE: this logic allows a provider to send back an empty response to
	// indicate that none of the requested entities are available, thus allowing
	// the requester engine to immediately request them from another provider
	for entityID := range needed {

		// the item might no longer be pending, e.g. if the entity was already
		// received in another response
		item, exists := e.items[entityID]
		if !exists {
			continue
		}

		// we set the timestamp to zero, so that the item will be included
		// in the next batch regardless of retry interval
		item.LastRequested = time.Time{}
	}

	return nil
}