github.com/koko1123/flow-go-1@v0.29.6/engine/common/requester/engine.go (about)

     1  package requester
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"math/rand"
     7  	"time"
     8  
     9  	"github.com/rs/zerolog"
    10  	"github.com/vmihailenco/msgpack"
    11  	"go.uber.org/atomic"
    12  
    13  	"github.com/koko1123/flow-go-1/engine"
    14  	"github.com/koko1123/flow-go-1/model/flow"
    15  	"github.com/koko1123/flow-go-1/model/flow/filter"
    16  	"github.com/koko1123/flow-go-1/model/messages"
    17  	"github.com/koko1123/flow-go-1/module"
    18  	"github.com/koko1123/flow-go-1/module/metrics"
    19  	"github.com/koko1123/flow-go-1/network"
    20  	"github.com/koko1123/flow-go-1/network/channels"
    21  	"github.com/koko1123/flow-go-1/state/protocol"
    22  	"github.com/koko1123/flow-go-1/utils/logging"
    23  )
    24  
    25  // HandleFunc is a function provided to the requester engine to handle an entity
    26  // once it has been retrieved from a provider. The function should be non-blocking
    27  // and errors should be handled internally within the function.
    28  type HandleFunc func(originID flow.Identifier, entity flow.Entity)
    29  
    30  // CreateFunc is a function that creates a `flow.Entity` with an underlying type
    31  // so that we can properly decode entities transmitted over the network.
    32  type CreateFunc func() flow.Entity
    33  
    34  // Engine is a generic requester engine, handling the requesting of entities
    35  // on the flow network. It is the `request` part of the request-reply
    36  // pattern provided by the pair of generic exchange engines.
    37  type Engine struct {
    38  	unit     *engine.Unit
    39  	log      zerolog.Logger
    40  	cfg      Config
    41  	metrics  module.EngineMetrics
    42  	me       module.Local
    43  	state    protocol.State
    44  	con      network.Conduit
    45  	channel  channels.Channel
    46  	selector flow.IdentityFilter
    47  	create   CreateFunc
    48  	handle   HandleFunc
    49  
    50  	// changing the following state variables must be guarded by unit.Lock()
    51  	items                 map[flow.Identifier]*Item
    52  	requests              map[uint64]*messages.EntityRequest
    53  	forcedDispatchOngoing *atomic.Bool // to ensure only trigger dispatching logic once at any time
    54  	rng                   *rand.Rand
    55  }
    56  
    57  // New creates a new requester engine, operating on the provided network channel, and requesting entities from a node
    58  // within the set obtained by applying the provided selector filter. The options allow customization of the parameters
    59  // related to the batch and retry logic.
    60  func New(log zerolog.Logger, metrics module.EngineMetrics, net network.Network, me module.Local, state protocol.State,
    61  	channel channels.Channel, selector flow.IdentityFilter, create CreateFunc, options ...OptionFunc) (*Engine, error) {
    62  
    63  	// initialize the default config
    64  	cfg := Config{
    65  		BatchThreshold:  32,
    66  		BatchInterval:   time.Second,
    67  		RetryInitial:    4 * time.Second,
    68  		RetryFunction:   RetryGeometric(2),
    69  		RetryMaximum:    2 * time.Minute,
    70  		RetryAttempts:   math.MaxUint32,
    71  		ValidateStaking: true,
    72  	}
    73  
    74  	// apply the custom option parameters
    75  	for _, option := range options {
    76  		option(&cfg)
    77  	}
    78  
    79  	// check validity of retry function
    80  	interval := cfg.RetryFunction(time.Second)
    81  	if interval < time.Second {
    82  		return nil, fmt.Errorf("invalid retry function (interval must always increase)")
    83  	}
    84  
    85  	// check validity of maximum interval
    86  	if cfg.RetryMaximum < cfg.RetryInitial {
    87  		return nil, fmt.Errorf("invalid retry maximum (must not be smaller than initial interval)")
    88  	}
    89  
    90  	// make sure we don't send requests from self
    91  	selector = filter.And(
    92  		selector,
    93  		filter.Not(filter.HasNodeID(me.NodeID())),
    94  		filter.Not(filter.Ejected),
    95  	)
    96  
    97  	// make sure we don't send requests to unauthorized nodes
    98  	if cfg.ValidateStaking {
    99  		selector = filter.And(
   100  			selector,
   101  			filter.HasWeight(true),
   102  		)
   103  	}
   104  
   105  	// initialize the propagation engine with its dependencies
   106  	e := &Engine{
   107  		unit:                  engine.NewUnit(),
   108  		log:                   log.With().Str("engine", "requester").Logger(),
   109  		cfg:                   cfg,
   110  		metrics:               metrics,
   111  		me:                    me,
   112  		state:                 state,
   113  		channel:               channel,
   114  		selector:              selector,
   115  		create:                create,
   116  		handle:                nil,
   117  		items:                 make(map[flow.Identifier]*Item),          // holds all pending items
   118  		requests:              make(map[uint64]*messages.EntityRequest), // holds all sent requests
   119  		forcedDispatchOngoing: atomic.NewBool(false),
   120  		rng:                   rand.New(rand.NewSource(time.Now().UnixNano())),
   121  	}
   122  
   123  	// register the engine with the network layer and store the conduit
   124  	con, err := net.Register(channels.Channel(channel), e)
   125  	if err != nil {
   126  		return nil, fmt.Errorf("could not register engine: %w", err)
   127  	}
   128  	e.con = con
   129  
   130  	return e, nil
   131  }
   132  
   133  // WithHandle sets the handle function of the requester, which is how it processes
   134  // returned entities. The engine can not be started without setting the handle
   135  // function. It is done in a separate call so that the requester can be injected
   136  // into engines upon construction, and then provide a handle function to the
   137  // requester from that engine itself.
   138  func (e *Engine) WithHandle(handle HandleFunc) *Engine {
   139  	e.handle = handle
   140  	return e
   141  }
   142  
   143  // Ready returns a ready channel that is closed once the engine has fully
   144  // started. For consensus engine, this is true once the underlying consensus
   145  // algorithm has started.
   146  func (e *Engine) Ready() <-chan struct{} {
   147  	if e.handle == nil {
   148  		panic("must initialize requester engine with handler")
   149  	}
   150  	e.unit.Launch(e.poll)
   151  	return e.unit.Ready()
   152  }
   153  
   154  // Done returns a done channel that is closed once the engine has fully stopped.
   155  // For the consensus engine, we wait for hotstuff to finish.
   156  func (e *Engine) Done() <-chan struct{} {
   157  	return e.unit.Done()
   158  }
   159  
   160  // SubmitLocal submits an message originating on the local node.
   161  func (e *Engine) SubmitLocal(message interface{}) {
   162  	e.unit.Launch(func() {
   163  		err := e.process(e.me.NodeID(), message)
   164  		if err != nil {
   165  			engine.LogError(e.log, err)
   166  		}
   167  	})
   168  }
   169  
   170  // Submit submits the given message from the node with the given origin ID
   171  // for processing in a non-blocking manner. It returns instantly and logs
   172  // a potential processing error internally when done.
   173  func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, message interface{}) {
   174  	e.unit.Launch(func() {
   175  		err := e.Process(channel, originID, message)
   176  		if err != nil {
   177  			engine.LogError(e.log, err)
   178  		}
   179  	})
   180  }
   181  
   182  // ProcessLocal processes an message originating on the local node.
   183  func (e *Engine) ProcessLocal(message interface{}) error {
   184  	return e.unit.Do(func() error {
   185  		return e.process(e.me.NodeID(), message)
   186  	})
   187  }
   188  
   189  // Process processes the given message from the node with the given origin ID in
   190  // a blocking manner. It returns the potential processing error when done.
   191  func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, message interface{}) error {
   192  	return e.unit.Do(func() error {
   193  		return e.process(originID, message)
   194  	})
   195  }
   196  
   197  // EntityByID adds an entity to the list of entities to be requested from the
   198  // provider. It is idempotent, meaning that adding the same entity to the
   199  // requester engine multiple times has no effect, unless the item has
   200  // expired due to too many requests and has thus been deleted from the
   201  // list. The provided selector will be applied to the set of valid providers on top
   202  // of the global selector injected upon construction. It allows for finer-grained
   203  // control over which subset of providers to request a given entity from, such as
   204  // selection of a collection cluster. Use `filter.Any` if no additional selection
   205  // is required. Checks integrity of response to make sure that we got entity that we were requesting.
   206  func (e *Engine) EntityByID(entityID flow.Identifier, selector flow.IdentityFilter) {
   207  	e.addEntityRequest(entityID, selector, true)
   208  }
   209  
   210  // Query will request data through the request engine backing the interface.
   211  // The additional selector will be applied to the subset
   212  // of valid providers for the data and allows finer-grained control
   213  // over which providers to request data from. Doesn't perform integrity check
   214  // can be used to get entities without knowing their ID.
   215  func (e *Engine) Query(key flow.Identifier, selector flow.IdentityFilter) {
   216  	e.addEntityRequest(key, selector, false)
   217  }
   218  
   219  func (e *Engine) addEntityRequest(entityID flow.Identifier, selector flow.IdentityFilter, checkIntegrity bool) {
   220  	e.unit.Lock()
   221  	defer e.unit.Unlock()
   222  
   223  	// check if we already have an item for this entity
   224  	_, duplicate := e.items[entityID]
   225  	if duplicate {
   226  		return
   227  	}
   228  
   229  	// otherwise, add a new item to the list
   230  	item := &Item{
   231  		EntityID:       entityID,
   232  		NumAttempts:    0,
   233  		LastRequested:  time.Time{},
   234  		RetryAfter:     e.cfg.RetryInitial,
   235  		ExtraSelector:  selector,
   236  		checkIntegrity: checkIntegrity,
   237  	}
   238  	e.items[entityID] = item
   239  }
   240  
   241  // Force will force the requester engine to dispatch all currently
   242  // valid batch requests.
   243  func (e *Engine) Force() {
   244  	// exit early in case a forced dispatch is currently ongoing
   245  	if e.forcedDispatchOngoing.Load() {
   246  		return
   247  	}
   248  
   249  	// using Launch to ensure the caller won't be blocked
   250  	e.unit.Launch(func() {
   251  		// using atomic bool to ensure there is at most one caller would trigger dispatching requests
   252  		if e.forcedDispatchOngoing.CompareAndSwap(false, true) {
   253  			count := uint(0)
   254  			for {
   255  				dispatched, err := e.dispatchRequest()
   256  				if err != nil {
   257  					e.log.Error().Err(err).Msg("could not dispatch requests")
   258  					break
   259  				}
   260  				if !dispatched {
   261  					e.log.Debug().Uint("requests", count).Msg("forced request dispatch")
   262  					break
   263  				}
   264  				count++
   265  			}
   266  			e.forcedDispatchOngoing.Store(false)
   267  		}
   268  	})
   269  }
   270  
   271  func (e *Engine) poll() {
   272  	ticker := time.NewTicker(e.cfg.BatchInterval)
   273  
   274  PollLoop:
   275  	for {
   276  		select {
   277  		case <-e.unit.Quit():
   278  			break PollLoop
   279  
   280  		case <-ticker.C:
   281  			if e.forcedDispatchOngoing.Load() {
   282  				return
   283  			}
   284  
   285  			dispatched, err := e.dispatchRequest()
   286  			if err != nil {
   287  				e.log.Error().Err(err).Msg("could not dispatch requests")
   288  				continue PollLoop
   289  			}
   290  			if dispatched {
   291  				e.log.Debug().Uint("requests", 1).Msg("regular request dispatch")
   292  			}
   293  		}
   294  	}
   295  
   296  	ticker.Stop()
   297  }
   298  
   299  // dispatchRequest dispatches a subset of requests (selection based on internal heuristic).
   300  // While `dispatchRequest` sends a request (covering some but not necessarily all items),
   301  // if and only if there is something to request. In other words it cannot happen that
   302  // `dispatchRequest` sends no request, but there is something to be requested.
   303  // The boolean return value indicates whether a request was dispatched at all.
   304  func (e *Engine) dispatchRequest() (bool, error) {
   305  
   306  	e.unit.Lock()
   307  	defer e.unit.Unlock()
   308  
   309  	e.log.Debug().Int("num_entities", len(e.items)).Msg("selecting entities")
   310  
   311  	// get the current top-level set of valid providers
   312  	providers, err := e.state.Final().Identities(e.selector)
   313  	if err != nil {
   314  		return false, fmt.Errorf("could not get providers: %w", err)
   315  	}
   316  
   317  	// randomize order of items, so that they can be requested in different order each time
   318  	rndItems := make([]flow.Identifier, 0, len(e.items))
   319  	for k := range e.items {
   320  		rndItems = append(rndItems, e.items[k].EntityID)
   321  	}
   322  	e.rng.Shuffle(len(rndItems), func(i, j int) { rndItems[i], rndItems[j] = rndItems[j], rndItems[i] })
   323  
   324  	// go through each item and decide if it should be requested again
   325  	now := time.Now().UTC()
   326  	var providerID flow.Identifier
   327  	var entityIDs []flow.Identifier
   328  	for _, entityID := range rndItems {
   329  		item := e.items[entityID]
   330  
   331  		// if the item should not be requested yet, ignore
   332  		cutoff := item.LastRequested.Add(item.RetryAfter)
   333  		if cutoff.After(now) {
   334  			continue
   335  		}
   336  
   337  		// if the item reached maximum amount of retries, drop
   338  		if item.NumAttempts >= e.cfg.RetryAttempts {
   339  			e.log.Debug().Str("entity_id", entityID.String()).Msg("dropping entity ID max amount of retries reached")
   340  			delete(e.items, entityID)
   341  			continue
   342  		}
   343  
   344  		// if the provider has already been chosen, check if this item
   345  		// can be requested from the same provider; otherwise skip it
   346  		// for now, so it will be part of the next batch request
   347  		if providerID != flow.ZeroID {
   348  			overlap := providers.Filter(filter.And(
   349  				filter.HasNodeID(providerID),
   350  				item.ExtraSelector,
   351  			))
   352  			if len(overlap) == 0 {
   353  				continue
   354  			}
   355  		}
   356  
   357  		// if no provider has been chosen yet, choose from restricted set
   358  		// NOTE: a single item can not permanently block requests going
   359  		// out when no providers are available for it, because the iteration
   360  		// order is random and will skip the item most of the times
   361  		// when other items are available
   362  		if providerID == flow.ZeroID {
   363  			providers = providers.Filter(item.ExtraSelector)
   364  			if len(providers) == 0 {
   365  				return false, fmt.Errorf("no valid providers available")
   366  			}
   367  			providerID = providers.Sample(1)[0].NodeID
   368  		}
   369  
   370  		// add item to list and set retry parameters
   371  		// NOTE: we add the retry interval to the last requested timestamp,
   372  		// rather than using the current timestamp, in order to conserve a
   373  		// more even distribution of timestamps over time, which should lead
   374  		// to a more even distribution of entities over batch requests
   375  		entityIDs = append(entityIDs, entityID)
   376  		item.NumAttempts++
   377  		item.LastRequested = now
   378  		item.RetryAfter = e.cfg.RetryFunction(item.RetryAfter)
   379  
   380  		// make sure the interval is within parameters
   381  		if item.RetryAfter < e.cfg.RetryInitial {
   382  			item.RetryAfter = e.cfg.RetryInitial
   383  		}
   384  		if item.RetryAfter > e.cfg.RetryMaximum {
   385  			item.RetryAfter = e.cfg.RetryMaximum
   386  		}
   387  
   388  		// if we reached the maximum size for a batch, bail
   389  		if uint(len(entityIDs)) >= e.cfg.BatchThreshold {
   390  			break
   391  		}
   392  	}
   393  
   394  	// if there are no items to request, return
   395  	if len(entityIDs) == 0 {
   396  		return false, nil
   397  	}
   398  
   399  	// create a batch request, send it and store it for reference
   400  	req := &messages.EntityRequest{
   401  		Nonce:     e.rng.Uint64(),
   402  		EntityIDs: entityIDs,
   403  	}
   404  
   405  	requestStart := time.Now()
   406  
   407  	if e.log.Debug().Enabled() {
   408  		e.log.Debug().
   409  			Hex("provider", logging.ID(providerID)).
   410  			Uint64("nonce", req.Nonce).
   411  			Int("num_selected", len(entityIDs)).
   412  			Strs("entities", logging.IDs(entityIDs)).
   413  			Msg("sending entity request")
   414  	}
   415  
   416  	err = e.con.Unicast(req, providerID)
   417  	if err != nil {
   418  		return true, fmt.Errorf("could not send request for entities %v: %w", logging.IDs(entityIDs), err)
   419  	}
   420  	e.requests[req.Nonce] = req
   421  
   422  	if e.log.Debug().Enabled() {
   423  		e.log.Debug().
   424  			Hex("provider", logging.ID(providerID)).
   425  			Uint64("nonce", req.Nonce).
   426  			Strs("entities", logging.IDs(entityIDs)).
   427  			TimeDiff("duration", time.Now(), requestStart).
   428  			Msg("entity request sent")
   429  	}
   430  
   431  	// NOTE: we forget about requests after the expiry of the shortest retry time
   432  	// from the entities in the list; this means that we purge requests aggressively.
   433  	// However, most requests should be responded to on the first attempt and clearing
   434  	// these up only removes the ability to instantly retry upon partial responses, so
   435  	// it won't affect much.
   436  	go func() {
   437  		<-time.After(e.cfg.RetryInitial)
   438  
   439  		e.unit.Lock()
   440  		defer e.unit.Unlock()
   441  		delete(e.requests, req.Nonce)
   442  	}()
   443  
   444  	e.metrics.MessageSent(e.channel.String(), metrics.MessageEntityRequest)
   445  	e.log.Debug().
   446  		Uint64("nonce", req.Nonce).
   447  		Strs("entity_ids", flow.IdentifierList(req.EntityIDs).Strings()).
   448  		Msg("entity request sent")
   449  
   450  	return true, nil
   451  }
   452  
   453  // process processes events for the propagation engine on the consensus node.
   454  func (e *Engine) process(originID flow.Identifier, message interface{}) error {
   455  
   456  	e.metrics.MessageReceived(e.channel.String(), metrics.MessageEntityResponse)
   457  	defer e.metrics.MessageHandled(e.channel.String(), metrics.MessageEntityResponse)
   458  
   459  	switch msg := message.(type) {
   460  	case *messages.EntityResponse:
   461  		return e.onEntityResponse(originID, msg)
   462  	default:
   463  		return engine.NewInvalidInputErrorf("invalid message type (%T)", message)
   464  	}
   465  }
   466  
   467  func (e *Engine) onEntityResponse(originID flow.Identifier, res *messages.EntityResponse) error {
   468  	lg := e.log.With().Str("origin_id", originID.String()).Uint64("nonce", res.Nonce).Logger()
   469  
   470  	lg.Debug().Strs("entity_ids", flow.IdentifierList(res.EntityIDs).Strings()).Msg("entity response received")
   471  
   472  	if e.cfg.ValidateStaking {
   473  
   474  		// check that the response comes from a valid provider
   475  		providers, err := e.state.Final().Identities(filter.And(
   476  			e.selector,
   477  			filter.HasNodeID(originID),
   478  		))
   479  		if err != nil {
   480  			return fmt.Errorf("could not get providers: %w", err)
   481  		}
   482  		if len(providers) == 0 {
   483  			return engine.NewInvalidInputErrorf("invalid provider origin (%x)", originID)
   484  		}
   485  	}
   486  
   487  	if e.log.Debug().Enabled() {
   488  		e.log.Debug().
   489  			Hex("provider", logging.ID(originID)).
   490  			Strs("entities", logging.IDs(res.EntityIDs)).
   491  			Uint64("nonce", res.Nonce).
   492  			Msg("onEntityResponse entries received")
   493  	}
   494  
   495  	e.unit.Lock()
   496  	defer e.unit.Unlock()
   497  
   498  	// build a list of needed entities; if not available, process anyway,
   499  	// but in that case we can't re-queue missing items
   500  	needed := make(map[flow.Identifier]struct{})
   501  	req, exists := e.requests[res.Nonce]
   502  	if exists {
   503  		delete(e.requests, req.Nonce)
   504  		for _, entityID := range req.EntityIDs {
   505  			needed[entityID] = struct{}{}
   506  		}
   507  	}
   508  
   509  	// ensure the response is correctly formed
   510  	if len(res.Blobs) != len(res.EntityIDs) {
   511  		return engine.NewInvalidInputErrorf("invalid response with %d blobs, %d IDs", len(res.Blobs), len(res.EntityIDs))
   512  	}
   513  
   514  	// process each entity in the response
   515  	// NOTE: this requires engines to be somewhat idempotent, which is a good
   516  	// thing, as it increases the robustness of their code
   517  	for i := 0; i < len(res.Blobs); i++ {
   518  		blob := res.Blobs[i]
   519  		entityID := res.EntityIDs[i]
   520  
   521  		// the entity might already have been returned in another response
   522  		item, exists := e.items[entityID]
   523  		if !exists {
   524  			lg.Debug().Hex("entity_id", logging.ID(entityID)).Msg("entity not in items skipping")
   525  			continue
   526  		}
   527  
   528  		// create the entity with underlying concrete type and decode blob
   529  		entity := e.create()
   530  		err := msgpack.Unmarshal(blob, &entity)
   531  		if err != nil {
   532  			return fmt.Errorf("could not decode entity: %w", err)
   533  		}
   534  
   535  		if item.checkIntegrity {
   536  			actualEntityID := entity.ID()
   537  			// validate that we got correct entity, exactly what we were expecting
   538  			if entityID != actualEntityID {
   539  				lg.Error().
   540  					Hex("stated_entity_id", logging.ID(entityID)).
   541  					Hex("provided_entity", logging.ID(actualEntityID)).
   542  					Bool(logging.KeySuspicious, true).
   543  					Msg("provided entity does not match stated ID")
   544  				continue
   545  			}
   546  		}
   547  
   548  		// remove from needed items and pending items
   549  		delete(needed, entityID)
   550  		delete(e.items, entityID)
   551  
   552  		// process the entity
   553  		go e.handle(originID, entity)
   554  	}
   555  
   556  	// requeue requested entities that have not been delivered in the response
   557  	// NOTE: this logic allows a provider to send back an empty response to
   558  	// indicate that none of the requested entities are available, thus allowing
   559  	// the requester engine to immediately request them from another provider
   560  	for entityID := range needed {
   561  
   562  		// it's possible the item is unavailable, if it was already received
   563  		// in another response
   564  		item, exists := e.items[entityID]
   565  		if !exists {
   566  			// the entity could have been received in another request
   567  			continue
   568  		}
   569  
   570  		// we set the timestamp to zero, so that the item will be included
   571  		// in the next batch regardless of retry interval
   572  		item.LastRequested = time.Time{}
   573  	}
   574  
   575  	return nil
   576  }