github.com/onflow/flow-go@v0.33.17/engine/common/requester/engine.go

package requester

import (
	"fmt"
	"math"
	"time"

	"github.com/rs/zerolog"
	"github.com/vmihailenco/msgpack"
	"go.uber.org/atomic"

	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/state/protocol"
	"github.com/onflow/flow-go/utils/logging"
	"github.com/onflow/flow-go/utils/rand"
)

// HandleFunc is a function provided to the requester engine to handle an entity
// once it has been retrieved from a provider. The function should be non-blocking
// and errors should be handled internally within the function.
type HandleFunc func(originID flow.Identifier, entity flow.Entity)

// CreateFunc is a function that creates a `flow.Entity` with an underlying type
// so that we can properly decode entities transmitted over the network.
type CreateFunc func() flow.Entity
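
// As an illustrative sketch (not part of the engine's API), a caller that
// requests collections might supply the pair below; the use of flow.Collection
// is an assumption for the example, any flow.Entity implementation works:
//
//	create := func() flow.Entity { return &flow.Collection{} }
//	handle := func(originID flow.Identifier, entity flow.Entity) {
//		collection := entity.(*flow.Collection)
//		_ = collection // hand off to a processing queue; must not block
//	}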

// Engine is a generic requester engine, handling the requesting of entities
// on the flow network. It is the `request` part of the request-reply
// pattern provided by the pair of generic exchange engines.
type Engine struct {
	unit     *engine.Unit
	log      zerolog.Logger
	cfg      Config
	metrics  module.EngineMetrics
	me       module.Local
	state    protocol.State
	con      network.Conduit
	channel  channels.Channel
	selector flow.IdentityFilter
	create   CreateFunc
	handle   HandleFunc

	// changes to the following state variables must be guarded by unit.Lock()
	items                 map[flow.Identifier]*Item
	requests              map[uint64]*messages.EntityRequest
	forcedDispatchOngoing *atomic.Bool // ensures the dispatching logic is triggered at most once at any time
}

// New creates a new requester engine, operating on the provided network channel, and requesting entities from a node
// within the set obtained by applying the provided selector filter. The options allow customization of the parameters
// related to the batch and retry logic.
func New(log zerolog.Logger, metrics module.EngineMetrics, net network.EngineRegistry, me module.Local, state protocol.State,
	channel channels.Channel, selector flow.IdentityFilter, create CreateFunc, options ...OptionFunc) (*Engine, error) {

	// initialize the default config
	cfg := Config{
		BatchThreshold:  32,
		BatchInterval:   time.Second,
		RetryInitial:    4 * time.Second,
		RetryFunction:   RetryGeometric(2),
		RetryMaximum:    2 * time.Minute,
		RetryAttempts:   math.MaxUint32,
		ValidateStaking: true,
	}

	// apply the custom option parameters
	for _, option := range options {
		option(&cfg)
	}

	// check validity of the retry function
	interval := cfg.RetryFunction(time.Second)
	if interval < time.Second {
		return nil, fmt.Errorf("invalid retry function (interval must always increase)")
	}

	// check validity of the maximum interval
	if cfg.RetryMaximum < cfg.RetryInitial {
		return nil, fmt.Errorf("invalid retry maximum (must not be smaller than initial interval)")
	}

	// make sure we don't send requests from self
	selector = filter.And(
		selector,
		filter.Not(filter.HasNodeID(me.NodeID())),
		filter.Not(filter.Ejected),
	)

	// make sure we don't send requests to unauthorized nodes
	if cfg.ValidateStaking {
		selector = filter.And(
			selector,
			filter.HasWeight(true),
		)
	}

	// initialize the requester engine with its dependencies
	e := &Engine{
		unit:                  engine.NewUnit(),
		log:                   log.With().Str("engine", "requester").Logger(),
		cfg:                   cfg,
		metrics:               metrics,
		me:                    me,
		state:                 state,
		channel:               channel,
		selector:              selector,
		create:                create,
		handle:                nil, // must be set via WithHandle before calling Ready
		items:                 make(map[flow.Identifier]*Item),          // holds all pending items
		requests:              make(map[uint64]*messages.EntityRequest), // holds all sent requests
		forcedDispatchOngoing: atomic.NewBool(false),
	}

	// register the engine with the network layer and store the conduit
	con, err := net.Register(channel, e)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}
	e.con = con

	return e, nil
}

// WithHandle sets the handle function of the requester, which is how it processes
// returned entities. The engine cannot be started without setting the handle
// function. It is a separate call so that the requester can be injected into
// consuming engines upon construction, which can then provide their own handle
// function back to the requester.
func (e *Engine) WithHandle(handle HandleFunc) *Engine {
	e.handle = handle
	return e
}
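
// A minimal construction sketch (illustrative only: the channel, role filter,
// and entity type are assumptions, and the inline option relies on OptionFunc
// being func(*Config), as its use in New suggests):
//
//	requester, err := New(log, metrics, net, me, state,
//		channels.RequestCollections,
//		filter.HasRole(flow.RoleCollection),
//		func() flow.Entity { return &flow.Collection{} },
//		func(cfg *Config) { cfg.BatchThreshold = 16 },
//	)
//	if err != nil {
//		return err
//	}
//	requester.WithHandle(func(originID flow.Identifier, entity flow.Entity) {
//		// enqueue the entity for processing; must not block
//	})
//	<-requester.Ready() // and on shutdown: <-requester.Done()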

// Ready returns a ready channel that is closed once the engine has fully
// started. For the requester engine, this is the case once the polling
// loop for batch dispatch has been launched.
func (e *Engine) Ready() <-chan struct{} {
	if e.handle == nil {
		panic("must initialize requester engine with handler")
	}
	e.unit.Launch(e.poll)
	return e.unit.Ready()
}

// Done returns a done channel that is closed once the engine has fully stopped.
// For the requester engine, this is the case once all launched routines have finished.
func (e *Engine) Done() <-chan struct{} {
	return e.unit.Done()
}

// SubmitLocal submits a message originating on the local node.
func (e *Engine) SubmitLocal(message interface{}) {
	e.unit.Launch(func() {
		err := e.process(e.me.NodeID(), message)
		if err != nil {
			engine.LogError(e.log, err)
		}
	})
}

// Submit submits the given message from the node with the given origin ID
// for processing in a non-blocking manner. It returns instantly and logs
// a potential processing error internally when done.
func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, message interface{}) {
	e.unit.Launch(func() {
		err := e.Process(channel, originID, message)
		if err != nil {
			engine.LogError(e.log, err)
		}
	})
}

// ProcessLocal processes a message originating on the local node.
func (e *Engine) ProcessLocal(message interface{}) error {
	return e.unit.Do(func() error {
		return e.process(e.me.NodeID(), message)
	})
}

// Process processes the given message from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, message interface{}) error {
	return e.unit.Do(func() error {
		return e.process(originID, message)
	})
}

// EntityByID adds an entity to the list of entities to be requested from the
// provider. It is idempotent, meaning that adding the same entity to the
// requester engine multiple times has no effect, unless the item has
// expired due to too many requests and has thus been deleted from the
// list. The provided selector will be applied to the set of valid providers on top
// of the global selector injected upon construction. It allows for finer-grained
// control over which subset of providers to request a given entity from, such as
// selection of a collection cluster. Use `filter.Any` if no additional selection
// is required. The response is checked for integrity, ensuring that the returned
// entity matches the requested entity ID.
func (e *Engine) EntityByID(entityID flow.Identifier, selector flow.IdentityFilter) {
	e.addEntityRequest(entityID, selector, true)
}

// Query will request data through the request engine backing the interface.
// The additional selector will be applied to the subset of valid providers for
// the data and allows finer-grained control over which providers to request
// data from. It does not perform an integrity check, so it can be used to get
// entities without knowing their ID in advance.
func (e *Engine) Query(key flow.Identifier, selector flow.IdentityFilter) {
	e.addEntityRequest(key, selector, false)
}
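
// As a usage sketch (the identifiers and the verification-role filter are
// hypothetical), a known entity is fetched with an integrity check, while a
// keyed lookup without a verifiable ID skips it:
//
//	requester.EntityByID(collectionID, filter.Any)
//	requester.Query(approvalKey, filter.HasRole(flow.RoleVerification))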

func (e *Engine) addEntityRequest(entityID flow.Identifier, selector flow.IdentityFilter, checkIntegrity bool) {
	e.unit.Lock()
	defer e.unit.Unlock()

	// check if we already have an item for this entity
	_, duplicate := e.items[entityID]
	if duplicate {
		return
	}

	// otherwise, add a new item to the list
	item := &Item{
		EntityID:       entityID,
		NumAttempts:    0,
		LastRequested:  time.Time{},
		RetryAfter:     e.cfg.RetryInitial,
		ExtraSelector:  selector,
		checkIntegrity: checkIntegrity,
	}
	e.items[entityID] = item
}

// Force will force the requester engine to dispatch all currently
// valid batch requests.
func (e *Engine) Force() {
	// exit early in case a forced dispatch is currently ongoing
	if e.forcedDispatchOngoing.Load() {
		return
	}

	// use Launch to ensure the caller won't be blocked
	e.unit.Launch(func() {
		// the atomic bool ensures that at most one caller triggers request dispatching at a time
		if e.forcedDispatchOngoing.CompareAndSwap(false, true) {
			count := uint(0)
			for {
				dispatched, err := e.dispatchRequest()
				if err != nil {
					e.log.Error().Err(err).Msg("could not dispatch requests")
					break
				}
				if !dispatched {
					e.log.Debug().Uint("requests", count).Msg("forced request dispatch")
					break
				}
				count++
			}
			e.forcedDispatchOngoing.Store(false)
		}
	})
}

func (e *Engine) poll() {
	ticker := time.NewTicker(e.cfg.BatchInterval)
	defer ticker.Stop()

PollLoop:
	for {
		select {
		case <-e.unit.Quit():
			break PollLoop

		case <-ticker.C:
			// skip this tick if a forced dispatch is already draining the queue
			if e.forcedDispatchOngoing.Load() {
				continue PollLoop
			}

			dispatched, err := e.dispatchRequest()
			if err != nil {
				e.log.Error().Err(err).Msg("could not dispatch requests")
				continue PollLoop
			}
			if dispatched {
				e.log.Debug().Uint("requests", 1).Msg("regular request dispatch")
			}
		}
	}
}

// dispatchRequest dispatches a subset of requests (selection based on an internal heuristic).
// It sends a request (covering some, but not necessarily all, items) if and only if there is
// something to request; it can never happen that `dispatchRequest` sends no request while
// there are items waiting to be requested.
// The boolean return value indicates whether a request was dispatched at all.
func (e *Engine) dispatchRequest() (bool, error) {

	e.unit.Lock()
	defer e.unit.Unlock()

	e.log.Debug().Int("num_entities", len(e.items)).Msg("selecting entities")

	// get the current top-level set of valid providers
	providers, err := e.state.Final().Identities(e.selector)
	if err != nil {
		return false, fmt.Errorf("could not get providers: %w", err)
	}

	// randomize the order of items, so that they can be requested in a different order each time
	rndItems := make([]flow.Identifier, 0, len(e.items))
	for k := range e.items {
		rndItems = append(rndItems, e.items[k].EntityID)
	}
	err = rand.Shuffle(uint(len(rndItems)), func(i, j uint) {
		rndItems[i], rndItems[j] = rndItems[j], rndItems[i]
	})
	if err != nil {
		return false, fmt.Errorf("shuffle failed: %w", err)
	}

	// go through each item and decide if it should be requested again
	now := time.Now().UTC()
	var providerID flow.Identifier
	var entityIDs []flow.Identifier
	for _, entityID := range rndItems {
		item := e.items[entityID]

		// if the item should not be requested yet, ignore
		cutoff := item.LastRequested.Add(item.RetryAfter)
		if cutoff.After(now) {
			continue
		}

		// if the item has reached the maximum number of retries, drop it
		if item.NumAttempts >= e.cfg.RetryAttempts {
			e.log.Debug().Str("entity_id", entityID.String()).Msg("dropping entity ID, max number of retries reached")
			delete(e.items, entityID)
			continue
		}

		// if the provider has already been chosen, check if this item
		// can be requested from the same provider; otherwise skip it
		// for now, so it will be part of the next batch request
		if providerID != flow.ZeroID {
			overlap := providers.Filter(filter.And(
				filter.HasNodeID(providerID),
				item.ExtraSelector,
			))
			if len(overlap) == 0 {
				continue
			}
		}

		// if no provider has been chosen yet, choose from the restricted set
		// NOTE: a single item cannot permanently block requests from going
		// out when no providers are available for it, because the iteration
		// order is random and will skip the item most of the time
		// when other items are available
		if providerID == flow.ZeroID {
			providers = providers.Filter(item.ExtraSelector)
			if len(providers) == 0 {
				return false, fmt.Errorf("no valid providers available")
			}
			id, err := providers.Sample(1)
			if err != nil {
				return false, fmt.Errorf("sampling failed: %w", err)
			}
			providerID = id[0].NodeID
		}

		// add the item to the list and set its retry parameters
		// NOTE: we add the retry interval to the last requested timestamp,
		// rather than using the current timestamp, in order to conserve a
		// more even distribution of timestamps over time, which should lead
		// to a more even distribution of entities over batch requests
		entityIDs = append(entityIDs, entityID)
		item.NumAttempts++
		item.LastRequested = now
		item.RetryAfter = e.cfg.RetryFunction(item.RetryAfter)

		// make sure the interval is within parameters
		if item.RetryAfter < e.cfg.RetryInitial {
			item.RetryAfter = e.cfg.RetryInitial
		}
		if item.RetryAfter > e.cfg.RetryMaximum {
			item.RetryAfter = e.cfg.RetryMaximum
		}
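
		// with the defaults above (illustrative), and assuming RetryGeometric(2)
		// doubles the interval on each attempt, an item's backoff progresses
		// 4s -> 8s -> 16s -> ... until clamped at the 2-minute RetryMaximum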

		// if we reached the maximum size for a batch, bail
		if uint(len(entityIDs)) >= e.cfg.BatchThreshold {
			break
		}
	}

	// if there are no items to request, return
	if len(entityIDs) == 0 {
		return false, nil
	}

	nonce, err := rand.Uint64()
	if err != nil {
		return false, fmt.Errorf("nonce generation failed: %w", err)
	}

	// create a batch request, send it and store it for reference
	req := &messages.EntityRequest{
		Nonce:     nonce,
		EntityIDs: entityIDs,
	}

	requestStart := time.Now()

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(providerID)).
			Uint64("nonce", req.Nonce).
			Int("num_selected", len(entityIDs)).
			Strs("entities", logging.IDs(entityIDs)).
			Msg("sending entity request")
	}

	err = e.con.Unicast(req, providerID)
	if err != nil {
		return true, fmt.Errorf("could not send request for entities %v: %w", logging.IDs(entityIDs), err)
	}
	e.requests[req.Nonce] = req

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(providerID)).
			Uint64("nonce", req.Nonce).
			Strs("entities", logging.IDs(entityIDs)).
			TimeDiff("duration", time.Now(), requestStart).
			Msg("entity request sent")
	}

	// NOTE: we forget about requests after the expiry of the shortest retry time
	// from the entities in the list; this means that we purge requests aggressively.
	// However, most requests should be responded to on the first attempt and clearing
	// these up only removes the ability to instantly retry upon partial responses, so
	// it won't affect much.
	go func() {
		<-time.After(e.cfg.RetryInitial)

		e.unit.Lock()
		defer e.unit.Unlock()
		delete(e.requests, req.Nonce)
	}()

	e.metrics.MessageSent(e.channel.String(), metrics.MessageEntityRequest)
	e.log.Debug().
		Uint64("nonce", req.Nonce).
		Strs("entity_ids", flow.IdentifierList(req.EntityIDs).Strings()).
		Msg("entity request sent")

	return true, nil
}

// process processes entity responses received by the requester engine.
func (e *Engine) process(originID flow.Identifier, message interface{}) error {

	e.metrics.MessageReceived(e.channel.String(), metrics.MessageEntityResponse)
	defer e.metrics.MessageHandled(e.channel.String(), metrics.MessageEntityResponse)

	switch msg := message.(type) {
	case *messages.EntityResponse:
		return e.onEntityResponse(originID, msg)
	default:
		return engine.NewInvalidInputErrorf("invalid message type (%T)", message)
	}
}

func (e *Engine) onEntityResponse(originID flow.Identifier, res *messages.EntityResponse) error {
	lg := e.log.With().Str("origin_id", originID.String()).Uint64("nonce", res.Nonce).Logger()

	lg.Debug().Strs("entity_ids", flow.IdentifierList(res.EntityIDs).Strings()).Msg("entity response received")

	if e.cfg.ValidateStaking {

		// check that the response comes from a valid provider
		providers, err := e.state.Final().Identities(filter.And(
			e.selector,
			filter.HasNodeID(originID),
		))
		if err != nil {
			return fmt.Errorf("could not get providers: %w", err)
		}
		if len(providers) == 0 {
			return engine.NewInvalidInputErrorf("invalid provider origin (%x)", originID)
		}
	}

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(originID)).
			Strs("entities", logging.IDs(res.EntityIDs)).
			Uint64("nonce", res.Nonce).
			Msg("onEntityResponse entries received")
	}

	e.unit.Lock()
	defer e.unit.Unlock()

	// build a list of needed entities; if not available, process anyway,
	// but in that case we can't re-queue missing items
	needed := make(map[flow.Identifier]struct{})
	req, exists := e.requests[res.Nonce]
	if exists {
		delete(e.requests, req.Nonce)
		for _, entityID := range req.EntityIDs {
			needed[entityID] = struct{}{}
		}
	}

	// ensure the response is correctly formed
	if len(res.Blobs) != len(res.EntityIDs) {
		return engine.NewInvalidInputErrorf("invalid response with %d blobs, %d IDs", len(res.Blobs), len(res.EntityIDs))
	}

	// process each entity in the response
	// NOTE: this requires engines to be somewhat idempotent, which is a good
	// thing, as it increases the robustness of their code
	for i := 0; i < len(res.Blobs); i++ {
		blob := res.Blobs[i]
		entityID := res.EntityIDs[i]

		// the entity might already have been returned in another response
		item, exists := e.items[entityID]
		if !exists {
			lg.Debug().Hex("entity_id", logging.ID(entityID)).Msg("entity not in items, skipping")
			continue
		}

		// create the entity with its underlying concrete type and decode the blob
		entity := e.create()
		err := msgpack.Unmarshal(blob, &entity)
		if err != nil {
			return fmt.Errorf("could not decode entity: %w", err)
		}

		if item.checkIntegrity {
			actualEntityID := entity.ID()
			// validate that we got the correct entity, exactly what we were expecting
			if entityID != actualEntityID {
				lg.Error().
					Hex("stated_entity_id", logging.ID(entityID)).
					Hex("provided_entity", logging.ID(actualEntityID)).
					Bool(logging.KeySuspicious, true).
					Msg("provided entity does not match stated ID")
				continue
			}
		}

		// remove from needed items and pending items
		delete(needed, entityID)
		delete(e.items, entityID)

		// process the entity
		go e.handle(originID, entity)
	}

	// requeue requested entities that have not been delivered in the response
	// NOTE: this logic allows a provider to send back an empty response to
	// indicate that none of the requested entities are available, thus allowing
	// the requester engine to immediately request them from another provider
	for entityID := range needed {

		// the item might no longer be pending if the entity was already
		// received in another response
		item, exists := e.items[entityID]
		if !exists {
			continue
		}

		// we set the timestamp to zero, so that the item will be included
		// in the next batch regardless of the retry interval
		item.LastRequested = time.Time{}
	}

	return nil
}